//====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual
/// flag bits.
///
/// We have to do this by carefully analyzing and rewriting the usage of the
/// copied EFLAGS register because there is no general way to rematerialize the
/// entire EFLAGS register safely and efficiently. Using `popf` both forces
/// dynamic stack adjustment and can create correctness issues due to IF, TF,
/// and other non-status flags being overwritten. Sequences involving SAHF
/// don't work on all x86 processors and are often quite slow compared to
/// directly testing a single status flag preserved in its own GPR.
///
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define PASS_KEY "x86-flags-copy-lowering"
#define DEBUG_TYPE PASS_KEY

STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
STATISTIC(NumTestsInserted, "Number of test instructions inserted");
STATISTIC(NumAddsInserted, "Number of adds instructions inserted");

namespace llvm {

void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);

} // end namespace llvm

namespace {

// Convenient array type for storing registers associated with each condition.
// Indexed by X86::CondCode; a zero entry means "no register holds this
// condition yet".
using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;

/// Machine function pass that finds COPY instructions targeting EFLAGS and
/// rewrites all transitive users of the copied flags (setCC, cmov, jCC,
/// flag-consuming arithmetic, ...) to instead test individual condition bits
/// saved into GR8 virtual registers at the point of the original flag def.
class X86FlagsCopyLoweringPass : public MachineFunctionPass {
public:
  X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {
    initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  // Per-function cached state, initialized in runOnMachineFunction.
  MachineRegisterInfo *MRI;
  const X86InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  // Register class used for the saved condition bytes (GR8).
  const TargetRegisterClass *PromoteRC;
  MachineDominatorTree *MDT;

  CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
                                  MachineInstr &CopyDefI);

  unsigned promoteCondToReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator TestPos,
                            DebugLoc TestLoc, X86::CondCode Cond);
  std::pair<unsigned, bool>
  getCondOrInverseInReg(MachineBasicBlock &TestMBB,
                        MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                        X86::CondCode Cond, CondRegArray &CondRegs);
  void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
                  DebugLoc Loc, unsigned Reg);

  void rewriteArithmetic(MachineBasicBlock &TestMBB,
                         MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                         MachineInstr &MI, MachineOperand &FlagUse,
                         CondRegArray &CondRegs);
  void rewriteCMov(MachineBasicBlock &TestMBB,
                   MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                   MachineInstr &CMovI, MachineOperand &FlagUse,
                   CondRegArray &CondRegs);
  void rewriteCondJmp(MachineBasicBlock &TestMBB,
                      MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                      MachineInstr &JmpI, CondRegArray &CondRegs);
  void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
                   MachineInstr &CopyDefI);
  void rewriteSetCarryExtended(MachineBasicBlock &TestMBB,
                               MachineBasicBlock::iterator TestPos,
                               DebugLoc TestLoc, MachineInstr &SetBI,
                               MachineOperand &FlagUse, CondRegArray &CondRegs);
  void rewriteSetCC(MachineBasicBlock &TestMBB,
                    MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                    MachineInstr &SetCCI, MachineOperand &FlagUse,
                    CondRegArray &CondRegs);
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE,
                      "X86 EFLAGS copy lowering", false, false)
INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE,
                    "X86 EFLAGS copy lowering", false, false)

FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
  return new X86FlagsCopyLoweringPass();
}

char X86FlagsCopyLoweringPass::ID = 0;

void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<MachineDominatorTree>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

namespace {
/// An enumeration of the arithmetic instruction mnemonics which have
/// interesting flag semantics.
///
/// We can map instruction opcodes into these mnemonics to make it easy to
/// dispatch with specific functionality.
enum class FlagArithMnemonic {
  ADC,
  ADCX,
  ADOX,
  RCL,
  RCR,
  SBB,
};
} // namespace

/// Map an instruction opcode onto the flag-arithmetic mnemonic it implements.
///
/// Covers all register/memory/immediate width variants of ADC, SBB, RCL, RCR,
/// ADCX, and ADOX. Any other opcode aborts compilation via report_fatal_error,
/// as we have no strategy for rematerializing flags into it.
static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    report_fatal_error("No support for lowering a copy into EFLAGS when used "
                       "by this instruction!");

#define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX)                              \
  case X86::MNEMONIC##8##SUFFIX:                                               \
  case X86::MNEMONIC##16##SUFFIX:                                              \
  case X86::MNEMONIC##32##SUFFIX:                                              \
  case X86::MNEMONIC##64##SUFFIX:

#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC)                                    \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr)                                        \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV)                                    \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm)                                        \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr)                                        \
  case X86::MNEMONIC##8ri:                                                     \
  case X86::MNEMONIC##16ri8:                                                   \
  case X86::MNEMONIC##32ri8:                                                   \
  case X86::MNEMONIC##64ri8:                                                   \
  case X86::MNEMONIC##16ri:                                                    \
  case X86::MNEMONIC##32ri:                                                    \
  case X86::MNEMONIC##64ri32:                                                  \
  case X86::MNEMONIC##8mi:                                                     \
  case X86::MNEMONIC##16mi8:                                                   \
  case X86::MNEMONIC##32mi8:                                                   \
  case X86::MNEMONIC##64mi8:                                                   \
  case X86::MNEMONIC##16mi:                                                    \
  case X86::MNEMONIC##32mi:                                                    \
  case X86::MNEMONIC##64mi32:                                                  \
  case X86::MNEMONIC##8i8:                                                     \
  case X86::MNEMONIC##16i16:                                                   \
  case X86::MNEMONIC##32i32:                                                   \
  case X86::MNEMONIC##64i32:

    LLVM_EXPAND_ADC_SBB_INSTR(ADC)
    return FlagArithMnemonic::ADC;

    LLVM_EXPAND_ADC_SBB_INSTR(SBB)
    return FlagArithMnemonic::SBB;

#undef LLVM_EXPAND_ADC_SBB_INSTR

    LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
    LLVM_EXPAND_INSTR_SIZES(RCL, r1)
    LLVM_EXPAND_INSTR_SIZES(RCL, ri)
    return FlagArithMnemonic::RCL;

    LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
    LLVM_EXPAND_INSTR_SIZES(RCR, r1)
    LLVM_EXPAND_INSTR_SIZES(RCR, ri)
    return FlagArithMnemonic::RCR;

#undef LLVM_EXPAND_INSTR_SIZES

  case X86::ADCX32rr:
  case X86::ADCX64rr:
  case X86::ADCX32rm:
  case X86::ADCX64rm:
    return FlagArithMnemonic::ADCX;

  case X86::ADOX32rr:
  case X86::ADOX64rr:
  case X86::ADOX32rm:
  case X86::ADOX64rm:
    return FlagArithMnemonic::ADOX;
  }
}

/// Split `MBB` at `SplitI`, moving `SplitI` and all subsequent terminators
/// into a freshly created block that is returned.
///
/// Expects (and asserts) that `SplitI` is a conditional branch immediately
/// preceded by exactly one other conditional branch that remains in `MBB`.
/// Successor lists, branch probabilities, and PHI operands in the successors
/// are all updated, including the case where both branches target the same
/// successor so the split creates a second edge to it.
static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
                                     MachineInstr &SplitI,
                                     const X86InstrInfo &TII) {
  MachineFunction &MF = *MBB.getParent();

  assert(SplitI.getParent() == &MBB &&
         "Split instruction must be in the split block!");
  assert(SplitI.isBranch() &&
         "Only designed to split a tail of branch instructions!");
  assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID &&
         "Must split on an actual jCC instruction!");

  // Dig out the previous instruction to the split point.
  // NOTE(review): this assumes the previous instruction is never a debug
  // instruction; if debug instructions can appear between terminators this
  // would need to skip them — confirm against MIR invariants here.
  MachineInstr &PrevI = *std::prev(SplitI.getIterator());
  assert(PrevI.isBranch() && "Must split after a branch!");
  assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID &&
         "Must split after an actual jCC instruction!");
  assert(!std::prev(PrevI.getIterator())->isTerminator() &&
         "Must only have this one terminator prior to the split!");

  // Grab the one successor edge that will stay in `MBB`.
  MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB();

  // Analyze the original block to see if we are actually splitting an edge
  // into two edges. This can happen when we have multiple conditional jumps to
  // the same successor.
  bool IsEdgeSplit =
      std::any_of(SplitI.getIterator(), MBB.instr_end(),
                  [&](MachineInstr &MI) {
                    assert(MI.isTerminator() &&
                           "Should only have spliced terminators!");
                    return llvm::any_of(
                        MI.operands(), [&](MachineOperand &MOp) {
                          return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc;
                        });
                  }) ||
      MBB.getFallThrough() == &UnsplitSucc;

  MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();

  // Insert the new block immediately after the current one. Any existing
  // fallthrough will be sunk into this new block anyways.
  MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);

  // Splice the tail of instructions into the new block.
  NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end());

  // Copy the necessary successors (and their probability info) into the new
  // block.
  for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
    if (IsEdgeSplit || *SI != &UnsplitSucc)
      NewMBB.copySuccessor(&MBB, SI);
  // Normalize the probabilities if we didn't end up splitting the edge.
  if (!IsEdgeSplit)
    NewMBB.normalizeSuccProbs();

  // Now replace all of the moved successors in the original block with the new
  // block. This will merge their probabilities.
  for (MachineBasicBlock *Succ : NewMBB.successors())
    if (Succ != &UnsplitSucc)
      MBB.replaceSuccessor(Succ, &NewMBB);

  // We should always end up replacing at least one successor.
  assert(MBB.isSuccessor(&NewMBB) &&
         "Failed to make the new block a successor!");

  // Now update all the PHIs.
  for (MachineBasicBlock *Succ : NewMBB.successors()) {
    for (MachineInstr &MI : *Succ) {
      if (!MI.isPHI())
        break;

      // PHI operands come in (value, block) pairs starting at index 1.
      for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
           OpIdx += 2) {
        MachineOperand &OpV = MI.getOperand(OpIdx);
        MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
        assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
        if (OpMBB.getMBB() != &MBB)
          continue;

        // Replace the operand for unsplit successors
        if (!IsEdgeSplit || Succ != &UnsplitSucc) {
          OpMBB.setMBB(&NewMBB);

          // We have to continue scanning as there may be multiple entries in
          // the PHI.
          continue;
        }

        // When we have split the edge append a new successor.
        MI.addOperand(MF, OpV);
        MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
        break;
      }
    }
  }

  return NewMBB;
}

/// Entry point: find every COPY into EFLAGS and rewrite all users of the
/// copied flags so the copy (and, when dead, its defining copy out of EFLAGS)
/// can be erased. Returns true whenever any copies were found, as rewriting
/// mutates the function.
bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << " **********\n");

  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
  MRI = &MF.getRegInfo();
  TII = Subtarget.getInstrInfo();
  TRI = Subtarget.getRegisterInfo();
  MDT = &getAnalysis<MachineDominatorTree>();
  PromoteRC = &X86::GR8RegClass;

  if (MF.begin() == MF.end())
    // Nothing to do for a degenerate empty function...
    return false;

  // Collect all the copies into EFLAGS first so rewriting (which splices
  // blocks and erases instructions) doesn't invalidate our iteration.
  SmallVector<MachineInstr *, 4> Copies;
  for (MachineBasicBlock &MBB : MF)
    for (MachineInstr &MI : MBB)
      if (MI.getOpcode() == TargetOpcode::COPY &&
          MI.getOperand(0).getReg() == X86::EFLAGS)
        Copies.push_back(&MI);

  for (MachineInstr *CopyI : Copies) {
    MachineBasicBlock &MBB = *CopyI->getParent();

    MachineOperand &VOp = CopyI->getOperand(1);
    assert(VOp.isReg() &&
           "The input to the copy for EFLAGS should always be a register!");
    MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg());
    if (CopyDefI.getOpcode() != TargetOpcode::COPY) {
      // FIXME: The big likely candidate here are PHI nodes. We could in theory
      // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard
      // enough that it is probably better to change every other part of LLVM
      // to avoid creating them. The issue is that once we have PHIs we won't
      // know which original EFLAGS value we need to capture with our setCCs
      // below. The end result will be computing a complete set of setCCs that
      // we *might* want, computing them in every place where we copy *out* of
      // EFLAGS and then doing SSA formation on all of them to insert necessary
      // PHI nodes and consume those here. Then hoping that somehow we DCE the
      // unnecessary ones. This DCE seems very unlikely to be successful and so
      // we will almost certainly end up with a glut of dead setCC
      // instructions. Until we have a motivating test case and fail to avoid
      // it by changing other parts of LLVM's lowering, we refuse to handle
      // this complex case here.
      LLVM_DEBUG(
          dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
          CopyDefI.dump());
      report_fatal_error(
          "Cannot lower EFLAGS copy unless it is defined in turn by a copy!");
    }

    auto Cleanup = make_scope_exit([&] {
      // All uses of the EFLAGS copy are now rewritten, kill the copy into
      // eflags and if dead the copy from.
      CopyI->eraseFromParent();
      if (MRI->use_empty(CopyDefI.getOperand(0).getReg()))
        CopyDefI.eraseFromParent();
      ++NumCopiesEliminated;
    });

    MachineOperand &DOp = CopyI->getOperand(0);
    assert(DOp.isDef() && "Expected register def!");
    assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!");
    if (DOp.isDead())
      continue;

    // The setCCs we insert are placed at the original flag def, i.e. where
    // the copy *out of* EFLAGS was made.
    MachineBasicBlock &TestMBB = *CopyDefI.getParent();
    auto TestPos = CopyDefI.getIterator();
    DebugLoc TestLoc = CopyDefI.getDebugLoc();

    LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());

    // Scan for usage of newly set EFLAGS so we can rewrite them. We just buffer
    // jumps because their usage is very constrained.
    bool FlagsKilled = false;
    SmallVector<MachineInstr *, 4> JmpIs;

    // Gather the condition flags that have already been preserved in
    // registers. We do this from scratch each time as we expect there to be
    // very few of them and we expect to not revisit the same copy definition
    // many times. If either of those change sufficiently we could build a map
    // of these up front instead.
    CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);

    // Collect the basic blocks we need to scan. Typically this will just be
    // a single basic block but we may have to scan multiple blocks if the
    // EFLAGS copy lives into successors.
    SmallVector<MachineBasicBlock *, 2> Blocks;
    SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
    Blocks.push_back(&MBB);
    VisitedBlocks.insert(&MBB);

    do {
      MachineBasicBlock &UseMBB = *Blocks.pop_back_val();

      // We currently don't do any PHI insertion and so we require that the
      // test basic block dominates all of the use basic blocks.
      //
      // We could in theory do PHI insertion here if it becomes useful by just
      // taking undef values in along every edge that we don't trace this
      // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
      // but still seems like a great deal of complexity.
      //
      // Because it is theoretically possible that some earlier MI pass or
      // other lowering transformation could induce this to happen, we do
      // a hard check even in non-debug builds here.
      if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
        LLVM_DEBUG({
          dbgs() << "ERROR: Encountered use that is not dominated by our test "
                    "basic block! Rewriting this would require inserting PHI "
                    "nodes to track the flag state across the CFG.\n\nTest "
                    "block:\n";
          TestMBB.dump();
          dbgs() << "Use block:\n";
          UseMBB.dump();
        });
        report_fatal_error("Cannot lower EFLAGS copy when original copy def "
                           "does not dominate all uses.");
      }

      // In the block containing the copy, start scanning just past it;
      // elsewhere, scan the whole block.
      for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
                                      : UseMBB.instr_begin(),
                MIE = UseMBB.instr_end();
           MII != MIE;) {
        MachineInstr &MI = *MII++;
        MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
        if (!FlagUse) {
          if (MI.findRegisterDefOperand(X86::EFLAGS)) {
            // If EFLAGS are defined, it's as-if they were killed. We can stop
            // scanning here.
            //
            // NB!!! Many instructions only modify some flags. LLVM currently
            // models this as clobbering all flags, but if that ever changes
            // this will need to be carefully updated to handle that more
            // complex logic.
            FlagsKilled = true;
            break;
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << "  Rewriting use: "; MI.dump());

        // Check the kill flag before we rewrite as that may change it.
        if (FlagUse->isKill())
          FlagsKilled = true;

        // Once we encounter a branch, the rest of the instructions must also be
        // branches. We can't rewrite in place here, so we handle them below.
        //
        // Note that we don't have to handle tail calls here, even conditional
        // tail calls, as those are not introduced into the X86 MI until post-RA
        // branch folding or block placement. As a consequence, we get to deal
        // with the simpler formulation of conditional branches followed by tail
        // calls.
        if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
          auto JmpIt = MI.getIterator();
          do {
            JmpIs.push_back(&*JmpIt);
            ++JmpIt;
          } while (JmpIt != UseMBB.instr_end() &&
                   X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
                       X86::COND_INVALID);
          break;
        }

        // Otherwise we can just rewrite in-place.
        if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
          rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
        } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
                   X86::COND_INVALID) {
          rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
        } else if (MI.getOpcode() == TargetOpcode::COPY) {
          rewriteCopy(MI, *FlagUse, CopyDefI);
        } else {
          // We assume all other instructions that use flags also def them.
          assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
                 "Expected a def of EFLAGS for this instruction!");

          // NB!!! Several arithmetic instructions only *partially* update
          // flags. Theoretically, we could generate MI code sequences that
          // would rely on this fact and observe different flags independently.
          // But currently LLVM models all of these instructions as clobbering
          // all the flags in an undef way. We rely on that to simplify the
          // logic.
          FlagsKilled = true;

          switch (MI.getOpcode()) {
          case X86::SETB_C8r:
          case X86::SETB_C16r:
          case X86::SETB_C32r:
          case X86::SETB_C64r:
            // Use custom lowering for arithmetic that is merely extending the
            // carry flag. We model this as the SETB_C* pseudo instructions.
            rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse,
                                    CondRegs);
            break;

          default:
            // Generically handle remaining uses as arithmetic instructions.
            rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse,
                              CondRegs);
            break;
          }
          break;
        }

        // If this was the last use of the flags, we're done.
        if (FlagsKilled)
          break;
      }

      // If the flags were killed, we're done with this block.
      if (FlagsKilled)
        break;

      // Otherwise we need to scan successors for ones where the flags live-in
      // and queue those up for processing.
      for (MachineBasicBlock *SuccMBB : UseMBB.successors())
        if (SuccMBB->isLiveIn(X86::EFLAGS) &&
            VisitedBlocks.insert(SuccMBB).second)
          Blocks.push_back(SuccMBB);
    } while (!Blocks.empty());

    // Now rewrite the jumps that use the flags. These we handle specially
    // because if there are multiple jumps in a single basic block we'll have
    // to do surgery on the CFG.
    MachineBasicBlock *LastJmpMBB = nullptr;
    for (MachineInstr *JmpI : JmpIs) {
      // Past the first jump within a basic block we need to split the blocks
      // apart.
      if (JmpI->getParent() == LastJmpMBB)
        splitBlock(*JmpI->getParent(), *JmpI, *TII);
      else
        LastJmpMBB = JmpI->getParent();

      rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
    }

    // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
    // the copy's def operand is itself a kill.
  }

#ifndef NDEBUG
  // Sanity check: no COPY touching EFLAGS may survive this pass.
  for (MachineBasicBlock &MBB : MF)
    for (MachineInstr &MI : MBB)
      if (MI.getOpcode() == TargetOpcode::COPY &&
          (MI.getOperand(0).getReg() == X86::EFLAGS ||
           MI.getOperand(1).getReg() == X86::EFLAGS)) {
        LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: ";
                   MI.dump());
        llvm_unreachable("Unlowered EFLAGS copy!");
      }
#endif

  return true;
}

/// Collect any conditions that have already been set in registers so that we
/// can re-use them rather than adding duplicates.
CondRegArray
X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB,
                                             MachineInstr &CopyDefI) {
  CondRegArray CondRegs = {};

  // Scan backwards across the range of instructions with live EFLAGS.
  for (MachineInstr &MI : llvm::reverse(
           llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) {
    X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
    if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() &&
        TRI->isVirtualRegister(MI.getOperand(0).getReg()))
      CondRegs[Cond] = MI.getOperand(0).getReg();

    // Stop scanning when we see the first definition of the EFLAGS as prior to
    // this we would potentially capture the wrong flag state.
    if (MI.findRegisterDefOperand(X86::EFLAGS))
      break;
  }
  return CondRegs;
}

/// Insert a setCC for \p Cond at the test position, materializing the
/// condition into a fresh GR8 virtual register that is returned.
unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    DebugLoc TestLoc, X86::CondCode Cond) {
  unsigned Reg = MRI->createVirtualRegister(PromoteRC);
  auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
                      TII->get(X86::getSETFromCond(Cond)), Reg);
  (void)SetI;
  LLVM_DEBUG(dbgs() << "    save cond: "; SetI->dump());
  ++NumSetCCsInserted;
  return Reg;
}

/// Return a register holding \p Cond or its inverse, preferring ones already
/// in \p CondRegs and otherwise materializing \p Cond with a new setCC. The
/// bool in the returned pair is true when the register holds the inverse.
std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
  unsigned &CondReg = CondRegs[Cond];
  unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)];
  if (!CondReg && !InvCondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);

  if (CondReg)
    return {CondReg, false};
  else
    return {InvCondReg, true};
}

/// Insert a TEST of \p Reg against itself at \p Pos so that ZF afterwards
/// reflects whether the saved condition byte was zero.
void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator Pos,
                                          DebugLoc Loc, unsigned Reg) {
  auto TestI =
      BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg);
  (void)TestI;
  LLVM_DEBUG(dbgs() << "    test cond: "; TestI->dump());
  ++NumTestsInserted;
}

/// Rewrite a flag-consuming arithmetic instruction (ADC/SBB/RCL/RCR/ADCX/
/// ADOX) by inserting an ADD8ri immediately before it that recomputes the
/// needed CF or OF from a saved condition byte.
void X86FlagsCopyLoweringPass::rewriteArithmetic(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
    CondRegArray &CondRegs) {
  // Arithmetic is either reading CF or OF. Figure out which condition we need
  // to preserve in a register.
  X86::CondCode Cond;

  // The addend to use to reset CF or OF when added to the flag value.
  int Addend;

  switch (getMnemonicFromOpcode(MI.getOpcode())) {
  case FlagArithMnemonic::ADC:
  case FlagArithMnemonic::ADCX:
  case FlagArithMnemonic::RCL:
  case FlagArithMnemonic::RCR:
  case FlagArithMnemonic::SBB:
    Cond = X86::COND_B; // CF == 1
    // Set up an addend that when one is added will need a carry due to not
    // having a higher bit available.
    Addend = 255;
    break;

  case FlagArithMnemonic::ADOX:
    Cond = X86::COND_O; // OF == 1
    // Set up an addend that when one is added will turn from positive to
    // negative and thus overflow in the signed domain.
    Addend = 127;
    break;
  }

  // Now get a register that contains the value of the flag input to the
  // arithmetic. We require exactly this flag to simplify the arithmetic
  // required to materialize it back into the flag.
  unsigned &CondReg = CondRegs[Cond];
  if (!CondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);

  MachineBasicBlock &MBB = *MI.getParent();

  // Insert an instruction that will set the flag back to the desired value.
  unsigned TmpReg = MRI->createVirtualRegister(PromoteRC);
  auto AddI =
      BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
          .addDef(TmpReg, RegState::Dead)
          .addReg(CondReg)
          .addImm(Addend);
  (void)AddI;
  LLVM_DEBUG(dbgs() << "    add cond: "; AddI->dump());
  ++NumAddsInserted;
  FlagUse.setIsKill(true);
}

/// Rewrite a CMOV so it tests a saved condition byte instead of the copied
/// EFLAGS, converting the opcode to CMOVNE (or CMOVE when only the inverse
/// condition was saved).
void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
                                           MachineBasicBlock::iterator TestPos,
                                           DebugLoc TestLoc,
                                           MachineInstr &CMovI,
                                           MachineOperand &FlagUse,
                                           CondRegArray &CondRegs) {
  // First get the register containing this specific condition.
  X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode());
  unsigned CondReg;
  bool Inverted;
  std::tie(CondReg, Inverted) =
      getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);

  MachineBasicBlock &MBB = *CMovI.getParent();

  // Insert a direct test of the saved register.
  insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);

  // Rewrite the CMov to use the !ZF flag from the test (but match register
  // size and memory operand), and then kill its use of the flags afterward.
  auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg());
  CMovI.setDesc(TII->get(X86::getCMovFromCond(
      Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8,
      !CMovI.memoperands_empty())));
  FlagUse.setIsKill(true);
  LLVM_DEBUG(dbgs() << "    fixed cmov: "; CMovI.dump());
}

/// Rewrite a conditional jump to test a saved condition byte, converting it
/// to JNE (or JE when only the inverse condition was saved).
void X86FlagsCopyLoweringPass::rewriteCondJmp(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
  // First get the register containing this specific condition.
  X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode());
  unsigned CondReg;
  bool Inverted;
  std::tie(CondReg, Inverted) =
      getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);

  MachineBasicBlock &JmpMBB = *JmpI.getParent();

  // Insert a direct test of the saved register.
  insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);

  // Rewrite the jump to use the !ZF flag from the test, and kill its use of
  // flags afterward.
  JmpI.setDesc(TII->get(
      X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE)));
  const int ImplicitEFLAGSOpIdx = 1;
  JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true);
  LLVM_DEBUG(dbgs() << "    fixed jCC: "; JmpI.dump());
}

/// Rewrite a COPY *out of* the copied EFLAGS by forwarding the original
/// copy's source register and erasing this redundant copy.
void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
                                           MachineOperand &FlagUse,
                                           MachineInstr &CopyDefI) {
  // Just replace this copy with the original copy def.
  MRI->replaceRegWith(MI.getOperand(0).getReg(),
                      CopyDefI.getOperand(0).getReg());
  MI.eraseFromParent();
}

/// Rewrite a SETB_C* pseudo (sign-extend of CF into a register) into a saved
/// CF byte, width adjustment, and a `0 - cond` subtraction producing the
/// all-zeros/all-ones mask, then forward the result to the pseudo's users.
void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse,
    CondRegArray &CondRegs) {
  // This routine is only used to handle pseudos for setting a register to zero
  // or all ones based on CF. This is essentially the sign extended from 1-bit
  // form of SETB and modeled with the SETB_C* pseudos. They require special
  // handling as they aren't normal SETcc instructions and are lowered to an
  // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that
  // they are only provided in reg-defining forms. A complicating factor is that
  // they can define many different register widths.
  assert(SetBI.getOperand(0).isReg() &&
         "Cannot have a non-register defined operand to this variant of SETB!");

  // Little helper to do the common final step of replacing the register def'ed
  // by this SETB instruction with a new register and removing the SETB
  // instruction.
  auto RewriteToReg = [&](unsigned Reg) {
    MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg);
    SetBI.eraseFromParent();
  };

  // Grab the register class used for this particular instruction.
  auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg());

  MachineBasicBlock &MBB = *SetBI.getParent();
  auto SetPos = SetBI.getIterator();
  auto SetLoc = SetBI.getDebugLoc();

  // Zero-extend, sub-register insert/extract, or copy `Reg` as needed so its
  // class matches `SetBRC`; returns the adjusted register.
  auto AdjustReg = [&](unsigned Reg) {
    auto &OrigRC = *MRI->getRegClass(Reg);
    if (&OrigRC == &SetBRC)
      return Reg;

    unsigned NewReg;

    int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8;
    int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8;
    assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!");
    assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!");
    // Indexed by register size in bytes; 8-byte handled via sub_32bit at [4].
    int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit,
                       X86::NoSubRegister, X86::sub_32bit};

    // If the original size is smaller than the target *and* is smaller than 4
    // bytes, we need to explicitly zero extend it. We always extend to 4-bytes
    // to maximize the chance of being able to CSE that operation and to avoid
    // partial dependency stalls extending to 2-bytes.
    if (OrigRegSize < TargetRegSize && OrigRegSize < 4) {
      NewReg = MRI->createVirtualRegister(&X86::GR32RegClass);
      BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg)
          .addReg(Reg);
      if (&SetBRC == &X86::GR32RegClass)
        return NewReg;
      Reg = NewReg;
      OrigRegSize = 4;
    }

    NewReg = MRI->createVirtualRegister(&SetBRC);
    if (OrigRegSize < TargetRegSize) {
      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG),
              NewReg)
          .addImm(0)
          .addReg(Reg)
          .addImm(SubRegIdx[OrigRegSize]);
    } else if (OrigRegSize > TargetRegSize) {
      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::EXTRACT_SUBREG),
              NewReg)
          .addReg(Reg)
          .addImm(SubRegIdx[TargetRegSize]);
    } else {
      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg)
          .addReg(Reg);
    }
    return NewReg;
  };

  unsigned &CondReg = CondRegs[X86::COND_B];
  if (!CondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B);

  // Adjust the condition to have the desired register width by zero-extending
  // as needed.
  // FIXME: We should use a better API to avoid the local reference and using a
  // different variable here.
  unsigned ExtCondReg = AdjustReg(CondReg);

  // Now we need to turn this into a bitmask. We do this by subtracting it from
  // zero.
  unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
  ZeroReg = AdjustReg(ZeroReg);

  unsigned Sub;
  switch (SetBI.getOpcode()) {
  case X86::SETB_C8r:
    Sub = X86::SUB8rr;
    break;

  case X86::SETB_C16r:
    Sub = X86::SUB16rr;
    break;

  case X86::SETB_C32r:
    Sub = X86::SUB32rr;
    break;

  case X86::SETB_C64r:
    Sub = X86::SUB64rr;
    break;

  default:
    llvm_unreachable("Invalid SETB_C* opcode!");
  }
  unsigned ResultReg = MRI->createVirtualRegister(&SetBRC);
  BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
      .addReg(ZeroReg)
      .addReg(ExtCondReg);
  return RewriteToReg(ResultReg);
}

/// Rewrite a setCC that uses the copied EFLAGS: forward the saved condition
/// byte for register-defining forms, or emit a MOV8mr of it for the
/// memory-storing forms, then erase the setCC.
void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
                                            MachineBasicBlock::iterator TestPos,
                                            DebugLoc TestLoc,
                                            MachineInstr &SetCCI,
                                            MachineOperand &FlagUse,
                                            CondRegArray &CondRegs) {
  X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode());
  // Note that we can't usefully rewrite this to the inverse without complex
  // analysis of the users of the setCC. Largely we rely on duplicates which
  // could have been avoided already being avoided here.
  unsigned &CondReg = CondRegs[Cond];
  if (!CondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);

  // Rewriting a register def is trivial: we just replace the register and
  // remove the setcc.
  if (!SetCCI.mayStore()) {
    assert(SetCCI.getOperand(0).isReg() &&
           "Cannot have a non-register defined operand to SETcc!");
    MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg);
    SetCCI.eraseFromParent();
    return;
  }

  // Otherwise, we need to emit a store.
  auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(),
                     SetCCI.getDebugLoc(), TII->get(X86::MOV8mr));
  // Copy the address operands.
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(SetCCI.getOperand(i));

  MIB.addReg(CondReg);

  MIB->setMemRefs(SetCCI.memoperands_begin(), SetCCI.memoperands_end());

  SetCCI.eraseFromParent();
  return;
}