1 //====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// \file 10 /// 11 /// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual 12 /// flag bits. 13 /// 14 /// We have to do this by carefully analyzing and rewriting the usage of the 15 /// copied EFLAGS register because there is no general way to rematerialize the 16 /// entire EFLAGS register safely and efficiently. Using `popf` both forces 17 /// dynamic stack adjustment and can create correctness issues due to IF, TF, 18 /// and other non-status flags being overwritten. Using sequences involving 19 /// SAHF don't work on all x86 processors and are often quite slow compared to 20 /// directly testing a single status preserved in its own GPR. 21 /// 22 //===----------------------------------------------------------------------===// 23 24 #include "X86.h" 25 #include "X86InstrBuilder.h" 26 #include "X86InstrInfo.h" 27 #include "X86Subtarget.h" 28 #include "llvm/ADT/ArrayRef.h" 29 #include "llvm/ADT/DenseMap.h" 30 #include "llvm/ADT/STLExtras.h" 31 #include "llvm/ADT/ScopeExit.h" 32 #include "llvm/ADT/SmallPtrSet.h" 33 #include "llvm/ADT/SmallSet.h" 34 #include "llvm/ADT/SmallVector.h" 35 #include "llvm/ADT/SparseBitVector.h" 36 #include "llvm/ADT/Statistic.h" 37 #include "llvm/CodeGen/MachineBasicBlock.h" 38 #include "llvm/CodeGen/MachineConstantPool.h" 39 #include "llvm/CodeGen/MachineDominators.h" 40 #include "llvm/CodeGen/MachineFunction.h" 41 #include "llvm/CodeGen/MachineFunctionPass.h" 42 #include "llvm/CodeGen/MachineInstr.h" 43 #include "llvm/CodeGen/MachineInstrBuilder.h" 44 #include "llvm/CodeGen/MachineModuleInfo.h" 45 #include "llvm/CodeGen/MachineOperand.h" 46 #include "llvm/CodeGen/MachineRegisterInfo.h" 47 #include "llvm/CodeGen/MachineSSAUpdater.h" 48 #include "llvm/CodeGen/TargetInstrInfo.h" 49 #include "llvm/CodeGen/TargetRegisterInfo.h" 50 #include "llvm/CodeGen/TargetSchedule.h" 51 #include "llvm/CodeGen/TargetSubtargetInfo.h" 52 #include "llvm/IR/DebugLoc.h" 53 #include "llvm/MC/MCSchedule.h" 54 #include "llvm/Pass.h" 55 #include "llvm/Support/CommandLine.h" 56 #include "llvm/Support/Debug.h" 57 #include "llvm/Support/raw_ostream.h" 58 #include <algorithm> 59 #include <cassert> 60 #include <iterator> 61 #include <utility> 62 63 using namespace llvm; 64 65 #define PASS_KEY "x86-flags-copy-lowering" 66 #define DEBUG_TYPE PASS_KEY 67 68 STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated"); 69 STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted"); 70 STATISTIC(NumTestsInserted, "Number of test instructions inserted"); 71 STATISTIC(NumAddsInserted, "Number of adds instructions inserted"); 72 73 namespace llvm { 74 75 void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); 76 77 } // end namespace llvm 78 79 namespace { 80 81 // Convenient array type for storing registers associated with each condition. 82 using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>; 83 84 class X86FlagsCopyLoweringPass : public MachineFunctionPass { 85 public: 86 X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { 87 initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry()); 88 } 89 90 StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; } 91 bool runOnMachineFunction(MachineFunction &MF) override; 92 void getAnalysisUsage(AnalysisUsage &AU) const override; 93 94 /// Pass identification, replacement for typeid. 95 static char ID; 96 97 private: 98 MachineRegisterInfo *MRI; 99 const X86InstrInfo *TII; 100 const TargetRegisterInfo *TRI; 101 const TargetRegisterClass *PromoteRC; 102 MachineDominatorTree *MDT; 103 104 CondRegArray collectCondsInRegs(MachineBasicBlock &MBB, 105 MachineInstr &CopyDefI); 106 107 unsigned promoteCondToReg(MachineBasicBlock &MBB, 108 MachineBasicBlock::iterator TestPos, 109 DebugLoc TestLoc, X86::CondCode Cond); 110 std::pair<unsigned, bool> 111 getCondOrInverseInReg(MachineBasicBlock &TestMBB, 112 MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, 113 X86::CondCode Cond, CondRegArray &CondRegs); 114 void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos, 115 DebugLoc Loc, unsigned Reg); 116 117 void rewriteArithmetic(MachineBasicBlock &TestMBB, 118 MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, 119 MachineInstr &MI, MachineOperand &FlagUse, 120 CondRegArray &CondRegs); 121 void rewriteCMov(MachineBasicBlock &TestMBB, 122 MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, 123 MachineInstr &CMovI, MachineOperand &FlagUse, 124 CondRegArray &CondRegs); 125 void rewriteCondJmp(MachineBasicBlock &TestMBB, 126 MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, 127 MachineInstr &JmpI, CondRegArray &CondRegs); 128 void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse, 129 MachineInstr &CopyDefI); 130 void rewriteSetCarryExtended(MachineBasicBlock &TestMBB, 131 MachineBasicBlock::iterator TestPos, 132 DebugLoc TestLoc, MachineInstr &SetBI, 133 MachineOperand &FlagUse, CondRegArray &CondRegs); 134 void rewriteSetCC(MachineBasicBlock &TestMBB, 135 MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, 136 MachineInstr &SetCCI, MachineOperand &FlagUse, 137 CondRegArray &CondRegs); 138 }; 139 140 } // end anonymous namespace 141 142 INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE, 143 "X86 EFLAGS copy lowering", false, false) 144 INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE, 145 "X86 EFLAGS copy lowering", false, false) 146 147 FunctionPass *llvm::createX86FlagsCopyLoweringPass() { 148 return new X86FlagsCopyLoweringPass(); 149 } 150 151 char X86FlagsCopyLoweringPass::ID = 0; 152 153 void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const { 154 AU.addRequired<MachineDominatorTree>(); 155 MachineFunctionPass::getAnalysisUsage(AU); 156 } 157 158 namespace { 159 /// An enumeration of the arithmetic instruction mnemonics which have 160 /// interesting flag semantics. 161 /// 162 /// We can map instruction opcodes into these mnemonics to make it easy to 163 /// dispatch with specific functionality. 164 enum class FlagArithMnemonic { 165 ADC, 166 ADCX, 167 ADOX, 168 RCL, 169 RCR, 170 SBB, 171 }; 172 } // namespace 173 174 static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) { 175 switch (Opcode) { 176 default: 177 report_fatal_error("No support for lowering a copy into EFLAGS when used " 178 "by this instruction!"); 179 180 #define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX) \ 181 case X86::MNEMONIC##8##SUFFIX: \ 182 case X86::MNEMONIC##16##SUFFIX: \ 183 case X86::MNEMONIC##32##SUFFIX: \ 184 case X86::MNEMONIC##64##SUFFIX: 185 186 #define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \ 187 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \ 188 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \ 189 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \ 190 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \ 191 case X86::MNEMONIC##8ri: \ 192 case X86::MNEMONIC##16ri8: \ 193 case X86::MNEMONIC##32ri8: \ 194 case X86::MNEMONIC##64ri8: \ 195 case X86::MNEMONIC##16ri: \ 196 case X86::MNEMONIC##32ri: \ 197 case X86::MNEMONIC##64ri32: \ 198 case X86::MNEMONIC##8mi: \ 199 case X86::MNEMONIC##16mi8: \ 200 case X86::MNEMONIC##32mi8: \ 201 case X86::MNEMONIC##64mi8: \ 202 case X86::MNEMONIC##16mi: \ 203 case X86::MNEMONIC##32mi: \ 204 case X86::MNEMONIC##64mi32: \ 205 case X86::MNEMONIC##8i8: \ 206 case X86::MNEMONIC##16i16: \ 207 case X86::MNEMONIC##32i32: \ 208 case X86::MNEMONIC##64i32: 209 210 LLVM_EXPAND_ADC_SBB_INSTR(ADC) 211 return FlagArithMnemonic::ADC; 212 213 LLVM_EXPAND_ADC_SBB_INSTR(SBB) 214 return FlagArithMnemonic::SBB; 215 216 #undef LLVM_EXPAND_ADC_SBB_INSTR 217 218 LLVM_EXPAND_INSTR_SIZES(RCL, rCL) 219 LLVM_EXPAND_INSTR_SIZES(RCL, r1) 220 LLVM_EXPAND_INSTR_SIZES(RCL, ri) 221 return FlagArithMnemonic::RCL; 222 223 LLVM_EXPAND_INSTR_SIZES(RCR, rCL) 224 LLVM_EXPAND_INSTR_SIZES(RCR, r1) 225 LLVM_EXPAND_INSTR_SIZES(RCR, ri) 226 return FlagArithMnemonic::RCR; 227 228 #undef LLVM_EXPAND_INSTR_SIZES 229 230 case X86::ADCX32rr: 231 case X86::ADCX64rr: 232 case X86::ADCX32rm: 233 case X86::ADCX64rm: 234 return FlagArithMnemonic::ADCX; 235 236 case X86::ADOX32rr: 237 case X86::ADOX64rr: 238 case X86::ADOX32rm: 239 case X86::ADOX64rm: 240 return FlagArithMnemonic::ADOX; 241 } 242 } 243 244 static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB, 245 MachineInstr &SplitI, 246 const X86InstrInfo &TII) { 247 MachineFunction &MF = *MBB.getParent(); 248 249 assert(SplitI.getParent() == &MBB && 250 "Split instruction must be in the split block!"); 251 assert(SplitI.isBranch() && 252 "Only designed to split a tail of branch instructions!"); 253 assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID && 254 "Must split on an actual jCC instruction!"); 255 256 // Dig out the previous instruction to the split point. 257 MachineInstr &PrevI = *std::prev(SplitI.getIterator()); 258 assert(PrevI.isBranch() && "Must split after a branch!"); 259 assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID && 260 "Must split after an actual jCC instruction!"); 261 assert(!std::prev(PrevI.getIterator())->isTerminator() && 262 "Must only have this one terminator prior to the split!"); 263 264 // Grab the one successor edge that will stay in `MBB`. 265 MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB(); 266 267 // Analyze the original block to see if we are actually splitting an edge 268 // into two edges. This can happen when we have multiple conditional jumps to 269 // the same successor. 270 bool IsEdgeSplit = 271 std::any_of(SplitI.getIterator(), MBB.instr_end(), 272 [&](MachineInstr &MI) { 273 assert(MI.isTerminator() && 274 "Should only have spliced terminators!"); 275 return llvm::any_of( 276 MI.operands(), [&](MachineOperand &MOp) { 277 return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc; 278 }); 279 }) || 280 MBB.getFallThrough() == &UnsplitSucc; 281 282 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock(); 283 284 // Insert the new block immediately after the current one. Any existing 285 // fallthrough will be sunk into this new block anyways. 286 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB); 287 288 // Splice the tail of instructions into the new block. 289 NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end()); 290 291 // Copy the necessary succesors (and their probability info) into the new 292 // block. 293 for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) 294 if (IsEdgeSplit || *SI != &UnsplitSucc) 295 NewMBB.copySuccessor(&MBB, SI); 296 // Normalize the probabilities if we didn't end up splitting the edge. 297 if (!IsEdgeSplit) 298 NewMBB.normalizeSuccProbs(); 299 300 // Now replace all of the moved successors in the original block with the new 301 // block. This will merge their probabilities. 302 for (MachineBasicBlock *Succ : NewMBB.successors()) 303 if (Succ != &UnsplitSucc) 304 MBB.replaceSuccessor(Succ, &NewMBB); 305 306 // We should always end up replacing at least one successor. 307 assert(MBB.isSuccessor(&NewMBB) && 308 "Failed to make the new block a successor!"); 309 310 // Now update all the PHIs. 311 for (MachineBasicBlock *Succ : NewMBB.successors()) { 312 for (MachineInstr &MI : *Succ) { 313 if (!MI.isPHI()) 314 break; 315 316 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps; 317 OpIdx += 2) { 318 MachineOperand &OpV = MI.getOperand(OpIdx); 319 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1); 320 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!"); 321 if (OpMBB.getMBB() != &MBB) 322 continue; 323 324 // Replace the operand for unsplit successors 325 if (!IsEdgeSplit || Succ != &UnsplitSucc) { 326 OpMBB.setMBB(&NewMBB); 327 328 // We have to continue scanning as there may be multiple entries in 329 // the PHI. 330 continue; 331 } 332 333 // When we have split the edge append a new successor. 334 MI.addOperand(MF, OpV); 335 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB)); 336 break; 337 } 338 } 339 } 340 341 return NewMBB; 342 } 343 344 bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { 345 DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() 346 << " **********\n"); 347 348 auto &Subtarget = MF.getSubtarget<X86Subtarget>(); 349 MRI = &MF.getRegInfo(); 350 TII = Subtarget.getInstrInfo(); 351 TRI = Subtarget.getRegisterInfo(); 352 MDT = &getAnalysis<MachineDominatorTree>(); 353 PromoteRC = &X86::GR8RegClass; 354 355 if (MF.begin() == MF.end()) 356 // Nothing to do for a degenerate empty function... 357 return false; 358 359 SmallVector<MachineInstr *, 4> Copies; 360 for (MachineBasicBlock &MBB : MF) 361 for (MachineInstr &MI : MBB) 362 if (MI.getOpcode() == TargetOpcode::COPY && 363 MI.getOperand(0).getReg() == X86::EFLAGS) 364 Copies.push_back(&MI); 365 366 for (MachineInstr *CopyI : Copies) { 367 MachineBasicBlock &MBB = *CopyI->getParent(); 368 369 MachineOperand &VOp = CopyI->getOperand(1); 370 assert(VOp.isReg() && 371 "The input to the copy for EFLAGS should always be a register!"); 372 MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg()); 373 if (CopyDefI.getOpcode() != TargetOpcode::COPY) { 374 // FIXME: The big likely candidate here are PHI nodes. We could in theory 375 // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard 376 // enough that it is probably better to change every other part of LLVM 377 // to avoid creating them. The issue is that once we have PHIs we won't 378 // know which original EFLAGS value we need to capture with our setCCs 379 // below. The end result will be computing a complete set of setCCs that 380 // we *might* want, computing them in every place where we copy *out* of 381 // EFLAGS and then doing SSA formation on all of them to insert necessary 382 // PHI nodes and consume those here. Then hoping that somehow we DCE the 383 // unnecessary ones. This DCE seems very unlikely to be successful and so 384 // we will almost certainly end up with a glut of dead setCC 385 // instructions. Until we have a motivating test case and fail to avoid 386 // it by changing other parts of LLVM's lowering, we refuse to handle 387 // this complex case here. 388 DEBUG(dbgs() << "ERROR: Encountered unexpected def of an eflags copy: "; 389 CopyDefI.dump()); 390 report_fatal_error( 391 "Cannot lower EFLAGS copy unless it is defined in turn by a copy!"); 392 } 393 394 auto Cleanup = make_scope_exit([&] { 395 // All uses of the EFLAGS copy are now rewritten, kill the copy into 396 // eflags and if dead the copy from. 397 CopyI->eraseFromParent(); 398 if (MRI->use_empty(CopyDefI.getOperand(0).getReg())) 399 CopyDefI.eraseFromParent(); 400 ++NumCopiesEliminated; 401 }); 402 403 MachineOperand &DOp = CopyI->getOperand(0); 404 assert(DOp.isDef() && "Expected register def!"); 405 assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!"); 406 if (DOp.isDead()) 407 continue; 408 409 MachineBasicBlock &TestMBB = *CopyDefI.getParent(); 410 auto TestPos = CopyDefI.getIterator(); 411 DebugLoc TestLoc = CopyDefI.getDebugLoc(); 412 413 DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump()); 414 415 // Scan for usage of newly set EFLAGS so we can rewrite them. We just buffer 416 // jumps because their usage is very constrained. 417 bool FlagsKilled = false; 418 SmallVector<MachineInstr *, 4> JmpIs; 419 420 // Gather the condition flags that have already been preserved in 421 // registers. We do this from scratch each time as we expect there to be 422 // very few of them and we expect to not revisit the same copy definition 423 // many times. If either of those change sufficiently we could build a map 424 // of these up front instead. 425 CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI); 426 427 // Collect the basic blocks we need to scan. Typically this will just be 428 // a single basic block but we may have to scan multiple blocks if the 429 // EFLAGS copy lives into successors. 430 SmallVector<MachineBasicBlock *, 2> Blocks; 431 SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks; 432 Blocks.push_back(&MBB); 433 VisitedBlocks.insert(&MBB); 434 435 do { 436 MachineBasicBlock &UseMBB = *Blocks.pop_back_val(); 437 438 // We currently don't do any PHI insertion and so we require that the 439 // test basic block dominates all of the use basic blocks. 440 // 441 // We could in theory do PHI insertion here if it becomes useful by just 442 // taking undef values in along every edge that we don't trace this 443 // EFLAGS copy along. This isn't as bad as fully general PHI insertion, 444 // but still seems like a great deal of complexity. 445 // 446 // Because it is theoretically possible that some earlier MI pass or 447 // other lowering transformation could induce this to happen, we do 448 // a hard check even in non-debug builds here. 449 if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) { 450 DEBUG({ 451 dbgs() << "ERROR: Encountered use that is not dominated by our test " 452 "basic block! Rewriting this would require inserting PHI " 453 "nodes to track the flag state across the CFG.\n\nTest " 454 "block:\n"; 455 TestMBB.dump(); 456 dbgs() << "Use block:\n"; 457 UseMBB.dump(); 458 }); 459 report_fatal_error("Cannot lower EFLAGS copy when original copy def " 460 "does not dominate all uses."); 461 } 462 463 for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator()) 464 : UseMBB.instr_begin(), 465 MIE = UseMBB.instr_end(); 466 MII != MIE;) { 467 MachineInstr &MI = *MII++; 468 MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS); 469 if (!FlagUse) { 470 if (MI.findRegisterDefOperand(X86::EFLAGS)) { 471 // If EFLAGS are defined, it's as-if they were killed. We can stop 472 // scanning here. 473 // 474 // NB!!! Many instructions only modify some flags. LLVM currently 475 // models this as clobbering all flags, but if that ever changes 476 // this will need to be carefully updated to handle that more 477 // complex logic. 478 FlagsKilled = true; 479 break; 480 } 481 continue; 482 } 483 484 DEBUG(dbgs() << " Rewriting use: "; MI.dump()); 485 486 // Check the kill flag before we rewrite as that may change it. 487 if (FlagUse->isKill()) 488 FlagsKilled = true; 489 490 // Once we encounter a branch, the rest of the instructions must also be 491 // branches. We can't rewrite in place here, so we handle them below. 492 // 493 // Note that we don't have to handle tail calls here, even conditional 494 // tail calls, as those are not introduced into the X86 MI until post-RA 495 // branch folding or black placement. As a consequence, we get to deal 496 // with the simpler formulation of conditional branches followed by tail 497 // calls. 498 if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) { 499 auto JmpIt = MI.getIterator(); 500 do { 501 JmpIs.push_back(&*JmpIt); 502 ++JmpIt; 503 } while (JmpIt != UseMBB.instr_end() && 504 X86::getCondFromBranchOpc(JmpIt->getOpcode()) != 505 X86::COND_INVALID); 506 break; 507 } 508 509 // Otherwise we can just rewrite in-place. 510 if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) { 511 rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); 512 } else if (X86::getCondFromSETOpc(MI.getOpcode()) != 513 X86::COND_INVALID) { 514 rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); 515 } else if (MI.getOpcode() == TargetOpcode::COPY) { 516 rewriteCopy(MI, *FlagUse, CopyDefI); 517 } else { 518 // We assume all other instructions that use flags also def them. 519 assert(MI.findRegisterDefOperand(X86::EFLAGS) && 520 "Expected a def of EFLAGS for this instruction!"); 521 522 // NB!!! Several arithmetic instructions only *partially* update 523 // flags. Theoretically, we could generate MI code sequences that 524 // would rely on this fact and observe different flags independently. 525 // But currently LLVM models all of these instructions as clobbering 526 // all the flags in an undef way. We rely on that to simplify the 527 // logic. 528 FlagsKilled = true; 529 530 switch (MI.getOpcode()) { 531 case X86::SETB_C8r: 532 case X86::SETB_C16r: 533 case X86::SETB_C32r: 534 case X86::SETB_C64r: 535 // Use custom lowering for arithmetic that is merely extending the 536 // carry flag. We model this as the SETB_C* pseudo instructions. 537 rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse, 538 CondRegs); 539 break; 540 541 default: 542 // Generically handle remaining uses as arithmetic instructions. 543 rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, 544 CondRegs); 545 break; 546 } 547 break; 548 } 549 550 // If this was the last use of the flags, we're done. 551 if (FlagsKilled) 552 break; 553 } 554 555 // If the flags were killed, we're done with this block. 556 if (FlagsKilled) 557 break; 558 559 // Otherwise we need to scan successors for ones where the flags live-in 560 // and queue those up for processing. 561 for (MachineBasicBlock *SuccMBB : UseMBB.successors()) 562 if (SuccMBB->isLiveIn(X86::EFLAGS) && 563 VisitedBlocks.insert(SuccMBB).second) 564 Blocks.push_back(SuccMBB); 565 } while (!Blocks.empty()); 566 567 // Now rewrite the jumps that use the flags. These we handle specially 568 // because if there are multiple jumps in a single basic block we'll have 569 // to do surgery on the CFG. 570 MachineBasicBlock *LastJmpMBB = nullptr; 571 for (MachineInstr *JmpI : JmpIs) { 572 // Past the first jump within a basic block we need to split the blocks 573 // apart. 574 if (JmpI->getParent() == LastJmpMBB) 575 splitBlock(*JmpI->getParent(), *JmpI, *TII); 576 else 577 LastJmpMBB = JmpI->getParent(); 578 579 rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs); 580 } 581 582 // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if 583 // the copy's def operand is itself a kill. 584 } 585 586 #ifndef NDEBUG 587 for (MachineBasicBlock &MBB : MF) 588 for (MachineInstr &MI : MBB) 589 if (MI.getOpcode() == TargetOpcode::COPY && 590 (MI.getOperand(0).getReg() == X86::EFLAGS || 591 MI.getOperand(1).getReg() == X86::EFLAGS)) { 592 DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: "; MI.dump()); 593 llvm_unreachable("Unlowered EFLAGS copy!"); 594 } 595 #endif 596 597 return true; 598 } 599 600 /// Collect any conditions that have already been set in registers so that we 601 /// can re-use them rather than adding duplicates. 602 CondRegArray 603 X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB, 604 MachineInstr &CopyDefI) { 605 CondRegArray CondRegs = {}; 606 607 // Scan backwards across the range of instructions with live EFLAGS. 608 for (MachineInstr &MI : llvm::reverse( 609 llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) { 610 X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode()); 611 if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() && 612 TRI->isVirtualRegister(MI.getOperand(0).getReg())) { 613 assert(MI.getOperand(0).isDef() && 614 "A non-storing SETcc should always define a register!"); 615 CondRegs[Cond] = MI.getOperand(0).getReg(); 616 } 617 618 // Stop scanning when we see the first definition of the EFLAGS as prior to 619 // this we would potentially capture the wrong flag state. 620 if (MI.findRegisterDefOperand(X86::EFLAGS)) 621 break; 622 } 623 return CondRegs; 624 } 625 626 unsigned X86FlagsCopyLoweringPass::promoteCondToReg( 627 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 628 DebugLoc TestLoc, X86::CondCode Cond) { 629 unsigned Reg = MRI->createVirtualRegister(PromoteRC); 630 auto SetI = BuildMI(TestMBB, TestPos, TestLoc, 631 TII->get(X86::getSETFromCond(Cond)), Reg); 632 (void)SetI; 633 DEBUG(dbgs() << " save cond: "; SetI->dump()); 634 ++NumSetCCsInserted; 635 return Reg; 636 } 637 638 std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg( 639 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 640 DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) { 641 unsigned &CondReg = CondRegs[Cond]; 642 unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)]; 643 if (!CondReg && !InvCondReg) 644 CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); 645 646 if (CondReg) 647 return {CondReg, false}; 648 else 649 return {InvCondReg, true}; 650 } 651 652 void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB, 653 MachineBasicBlock::iterator Pos, 654 DebugLoc Loc, unsigned Reg) { 655 // We emit test instructions as register/immediate test against -1. This 656 // allows register allocation to fold a memory operand if needed (that will 657 // happen often due to the places this code is emitted). But hopefully will 658 // also allow us to select a shorter encoding of `testb %reg, %reg` when that 659 // would be equivalent. 660 auto TestI = 661 BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg); 662 (void)TestI; 663 DEBUG(dbgs() << " test cond: "; TestI->dump()); 664 ++NumTestsInserted; 665 } 666 667 void X86FlagsCopyLoweringPass::rewriteArithmetic( 668 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 669 DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse, 670 CondRegArray &CondRegs) { 671 // Arithmetic is either reading CF or OF. Figure out which condition we need 672 // to preserve in a register. 673 X86::CondCode Cond; 674 675 // The addend to use to reset CF or OF when added to the flag value. 676 int Addend; 677 678 switch (getMnemonicFromOpcode(MI.getOpcode())) { 679 case FlagArithMnemonic::ADC: 680 case FlagArithMnemonic::ADCX: 681 case FlagArithMnemonic::RCL: 682 case FlagArithMnemonic::RCR: 683 case FlagArithMnemonic::SBB: 684 Cond = X86::COND_B; // CF == 1 685 // Set up an addend that when one is added will need a carry due to not 686 // having a higher bit available. 687 Addend = 255; 688 break; 689 690 case FlagArithMnemonic::ADOX: 691 Cond = X86::COND_O; // OF == 1 692 // Set up an addend that when one is added will turn from positive to 693 // negative and thus overflow in the signed domain. 694 Addend = 127; 695 break; 696 } 697 698 // Now get a register that contains the value of the flag input to the 699 // arithmetic. We require exactly this flag to simplify the arithmetic 700 // required to materialize it back into the flag. 701 unsigned &CondReg = CondRegs[Cond]; 702 if (!CondReg) 703 CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); 704 705 MachineBasicBlock &MBB = *MI.getParent(); 706 707 // Insert an instruction that will set the flag back to the desired value. 708 unsigned TmpReg = MRI->createVirtualRegister(PromoteRC); 709 auto AddI = 710 BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri)) 711 .addDef(TmpReg, RegState::Dead) 712 .addReg(CondReg) 713 .addImm(Addend); 714 (void)AddI; 715 DEBUG(dbgs() << " add cond: "; AddI->dump()); 716 ++NumAddsInserted; 717 FlagUse.setIsKill(true); 718 } 719 720 void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB, 721 MachineBasicBlock::iterator TestPos, 722 DebugLoc TestLoc, 723 MachineInstr &CMovI, 724 MachineOperand &FlagUse, 725 CondRegArray &CondRegs) { 726 // First get the register containing this specific condition. 727 X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode()); 728 unsigned CondReg; 729 bool Inverted; 730 std::tie(CondReg, Inverted) = 731 getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); 732 733 MachineBasicBlock &MBB = *CMovI.getParent(); 734 735 // Insert a direct test of the saved register. 736 insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg); 737 738 // Rewrite the CMov to use the !ZF flag from the test (but match register 739 // size and memory operand), and then kill its use of the flags afterward. 740 auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg()); 741 CMovI.setDesc(TII->get(X86::getCMovFromCond( 742 Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8, 743 !CMovI.memoperands_empty()))); 744 FlagUse.setIsKill(true); 745 DEBUG(dbgs() << " fixed cmov: "; CMovI.dump()); 746 } 747 748 void X86FlagsCopyLoweringPass::rewriteCondJmp( 749 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 750 DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) { 751 // First get the register containing this specific condition. 752 X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode()); 753 unsigned CondReg; 754 bool Inverted; 755 std::tie(CondReg, Inverted) = 756 getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); 757 758 MachineBasicBlock &JmpMBB = *JmpI.getParent(); 759 760 // Insert a direct test of the saved register. 761 insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg); 762 763 // Rewrite the jump to use the !ZF flag from the test, and kill its use of 764 // flags afterward. 765 JmpI.setDesc(TII->get( 766 X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE))); 767 const int ImplicitEFLAGSOpIdx = 1; 768 JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true); 769 DEBUG(dbgs() << " fixed jCC: "; JmpI.dump()); 770 } 771 772 void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI, 773 MachineOperand &FlagUse, 774 MachineInstr &CopyDefI) { 775 // Just replace this copy with the the original copy def. 776 MRI->replaceRegWith(MI.getOperand(0).getReg(), 777 CopyDefI.getOperand(0).getReg()); 778 MI.eraseFromParent(); 779 } 780 781 void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( 782 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 783 DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse, 784 CondRegArray &CondRegs) { 785 // This routine is only used to handle pseudos for setting a register to zero 786 // or all ones based on CF. This is essentially the sign extended from 1-bit 787 // form of SETB and modeled with the SETB_C* pseudos. They require special 788 // handling as they aren't normal SETcc instructions and are lowered to an 789 // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that 790 // they are only provided in reg-defining forms. A complicating factor is that 791 // they can define many different register widths. 792 assert(SetBI.getOperand(0).isReg() && 793 "Cannot have a non-register defined operand to this variant of SETB!"); 794 795 // Little helper to do the common final step of replacing the register def'ed 796 // by this SETB instruction with a new register and removing the SETB 797 // instruction. 798 auto RewriteToReg = [&](unsigned Reg) { 799 MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg); 800 SetBI.eraseFromParent(); 801 }; 802 803 // Grab the register class used for this particular instruction. 804 auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg()); 805 806 MachineBasicBlock &MBB = *SetBI.getParent(); 807 auto SetPos = SetBI.getIterator(); 808 auto SetLoc = SetBI.getDebugLoc(); 809 810 auto AdjustReg = [&](unsigned Reg) { 811 auto &OrigRC = *MRI->getRegClass(Reg); 812 if (&OrigRC == &SetBRC) 813 return Reg; 814 815 unsigned NewReg; 816 817 int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8; 818 int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8; 819 assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!"); 820 assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!"); 821 int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit, 822 X86::NoSubRegister, X86::sub_32bit}; 823 824 // If the original size is smaller than the target *and* is smaller than 4 825 // bytes, we need to explicitly zero extend it. We always extend to 4-bytes 826 // to maximize the chance of being able to CSE that operation and to avoid 827 // partial dependency stalls extending to 2-bytes. 828 if (OrigRegSize < TargetRegSize && OrigRegSize < 4) { 829 NewReg = MRI->createVirtualRegister(&X86::GR32RegClass); 830 BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg) 831 .addReg(Reg); 832 if (&SetBRC == &X86::GR32RegClass) 833 return NewReg; 834 Reg = NewReg; 835 OrigRegSize = 4; 836 } 837 838 NewReg = MRI->createVirtualRegister(&SetBRC); 839 if (OrigRegSize < TargetRegSize) { 840 BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG), 841 NewReg) 842 .addImm(0) 843 .addReg(Reg) 844 .addImm(SubRegIdx[OrigRegSize]); 845 } else if (OrigRegSize > TargetRegSize) { 846 BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::EXTRACT_SUBREG), 847 NewReg) 848 .addReg(Reg) 849 .addImm(SubRegIdx[TargetRegSize]); 850 } else { 851 BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg) 852 .addReg(Reg); 853 } 854 return NewReg; 855 }; 856 857 unsigned &CondReg = CondRegs[X86::COND_B]; 858 if (!CondReg) 859 CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B); 860 861 // Adjust the condition to have the desired register width by zero-extending 862 // as needed. 863 // FIXME: We should use a better API to avoid the local reference and using a 864 // different variable here. 865 unsigned ExtCondReg = AdjustReg(CondReg); 866 867 // Now we need to turn this into a bitmask. We do this by subtracting it from 868 // zero. 869 unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); 870 BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg); 871 ZeroReg = AdjustReg(ZeroReg); 872 873 unsigned Sub; 874 switch (SetBI.getOpcode()) { 875 case X86::SETB_C8r: 876 Sub = X86::SUB8rr; 877 break; 878 879 case X86::SETB_C16r: 880 Sub = X86::SUB16rr; 881 break; 882 883 case X86::SETB_C32r: 884 Sub = X86::SUB32rr; 885 break; 886 887 case X86::SETB_C64r: 888 Sub = X86::SUB64rr; 889 break; 890 891 default: 892 llvm_unreachable("Invalid SETB_C* opcode!"); 893 } 894 unsigned ResultReg = MRI->createVirtualRegister(&SetBRC); 895 BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg) 896 .addReg(ZeroReg) 897 .addReg(ExtCondReg); 898 return RewriteToReg(ResultReg); 899 } 900 901 void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB, 902 MachineBasicBlock::iterator TestPos, 903 DebugLoc TestLoc, 904 MachineInstr &SetCCI, 905 MachineOperand &FlagUse, 906 CondRegArray &CondRegs) { 907 X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode()); 908 // Note that we can't usefully rewrite this to the inverse without complex 909 // analysis of the users of the setCC. Largely we rely on duplicates which 910 // could have been avoided already being avoided here. 911 unsigned &CondReg = CondRegs[Cond]; 912 if (!CondReg) 913 CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); 914 915 // Rewriting a register def is trivial: we just replace the register and 916 // remove the setcc. 917 if (!SetCCI.mayStore()) { 918 assert(SetCCI.getOperand(0).isReg() && 919 "Cannot have a non-register defined operand to SETcc!"); 920 MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg); 921 SetCCI.eraseFromParent(); 922 return; 923 } 924 925 // Otherwise, we need to emit a store. 926 auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(), 927 SetCCI.getDebugLoc(), TII->get(X86::MOV8mr)); 928 // Copy the address operands. 929 for (int i = 0; i < X86::AddrNumOperands; ++i) 930 MIB.add(SetCCI.getOperand(i)); 931 932 MIB.addReg(CondReg); 933 934 MIB->setMemRefs(SetCCI.memoperands_begin(), SetCCI.memoperands_end()); 935 936 SetCCI.eraseFromParent(); 937 return; 938 } 939