1 //====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// \file 10 /// 11 /// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual 12 /// flag bits. 13 /// 14 /// We have to do this by carefully analyzing and rewriting the usage of the 15 /// copied EFLAGS register because there is no general way to rematerialize the 16 /// entire EFLAGS register safely and efficiently. Using `popf` both forces 17 /// dynamic stack adjustment and can create correctness issues due to IF, TF, 18 /// and other non-status flags being overwritten. Using sequences involving 19 /// SAHF don't work on all x86 processors and are often quite slow compared to 20 /// directly testing a single status preserved in its own GPR. 21 /// 22 //===----------------------------------------------------------------------===// 23 24 #include "X86.h" 25 #include "X86InstrBuilder.h" 26 #include "X86InstrInfo.h" 27 #include "X86Subtarget.h" 28 #include "llvm/ADT/ArrayRef.h" 29 #include "llvm/ADT/DenseMap.h" 30 #include "llvm/ADT/PostOrderIterator.h" 31 #include "llvm/ADT/STLExtras.h" 32 #include "llvm/ADT/ScopeExit.h" 33 #include "llvm/ADT/SmallPtrSet.h" 34 #include "llvm/ADT/SmallSet.h" 35 #include "llvm/ADT/SmallVector.h" 36 #include "llvm/ADT/SparseBitVector.h" 37 #include "llvm/ADT/Statistic.h" 38 #include "llvm/CodeGen/MachineBasicBlock.h" 39 #include "llvm/CodeGen/MachineConstantPool.h" 40 #include "llvm/CodeGen/MachineDominators.h" 41 #include "llvm/CodeGen/MachineFunction.h" 42 #include "llvm/CodeGen/MachineFunctionPass.h" 43 #include "llvm/CodeGen/MachineInstr.h" 44 #include "llvm/CodeGen/MachineInstrBuilder.h" 45 #include "llvm/CodeGen/MachineModuleInfo.h" 46 #include "llvm/CodeGen/MachineOperand.h" 47 #include "llvm/CodeGen/MachineRegisterInfo.h" 48 #include "llvm/CodeGen/MachineSSAUpdater.h" 49 #include "llvm/CodeGen/TargetInstrInfo.h" 50 #include "llvm/CodeGen/TargetRegisterInfo.h" 51 #include "llvm/CodeGen/TargetSchedule.h" 52 #include "llvm/CodeGen/TargetSubtargetInfo.h" 53 #include "llvm/IR/DebugLoc.h" 54 #include "llvm/MC/MCSchedule.h" 55 #include "llvm/Pass.h" 56 #include "llvm/Support/CommandLine.h" 57 #include "llvm/Support/Debug.h" 58 #include "llvm/Support/raw_ostream.h" 59 #include <algorithm> 60 #include <cassert> 61 #include <iterator> 62 #include <utility> 63 64 using namespace llvm; 65 66 #define PASS_KEY "x86-flags-copy-lowering" 67 #define DEBUG_TYPE PASS_KEY 68 69 STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated"); 70 STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted"); 71 STATISTIC(NumTestsInserted, "Number of test instructions inserted"); 72 STATISTIC(NumAddsInserted, "Number of adds instructions inserted"); 73 74 namespace llvm { 75 76 void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); 77 78 } // end namespace llvm 79 80 namespace { 81 82 // Convenient array type for storing registers associated with each condition. 83 using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>; 84 85 class X86FlagsCopyLoweringPass : public MachineFunctionPass { 86 public: 87 X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { 88 initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry()); 89 } 90 91 StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; } 92 bool runOnMachineFunction(MachineFunction &MF) override; 93 void getAnalysisUsage(AnalysisUsage &AU) const override; 94 95 /// Pass identification, replacement for typeid. 96 static char ID; 97 98 private: 99 MachineRegisterInfo *MRI; 100 const X86InstrInfo *TII; 101 const TargetRegisterInfo *TRI; 102 const TargetRegisterClass *PromoteRC; 103 MachineDominatorTree *MDT; 104 105 CondRegArray collectCondsInRegs(MachineBasicBlock &MBB, 106 MachineBasicBlock::iterator CopyDefI); 107 108 unsigned promoteCondToReg(MachineBasicBlock &MBB, 109 MachineBasicBlock::iterator TestPos, 110 DebugLoc TestLoc, X86::CondCode Cond); 111 std::pair<unsigned, bool> 112 getCondOrInverseInReg(MachineBasicBlock &TestMBB, 113 MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, 114 X86::CondCode Cond, CondRegArray &CondRegs); 115 void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos, 116 DebugLoc Loc, unsigned Reg); 117 118 void rewriteArithmetic(MachineBasicBlock &TestMBB, 119 MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, 120 MachineInstr &MI, MachineOperand &FlagUse, 121 CondRegArray &CondRegs); 122 void rewriteCMov(MachineBasicBlock &TestMBB, 123 MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, 124 MachineInstr &CMovI, MachineOperand &FlagUse, 125 CondRegArray &CondRegs); 126 void rewriteCondJmp(MachineBasicBlock &TestMBB, 127 MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, 128 MachineInstr &JmpI, CondRegArray &CondRegs); 129 void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse, 130 MachineInstr &CopyDefI); 131 void rewriteSetCarryExtended(MachineBasicBlock &TestMBB, 132 MachineBasicBlock::iterator TestPos, 133 DebugLoc TestLoc, MachineInstr &SetBI, 134 MachineOperand &FlagUse, CondRegArray &CondRegs); 135 void rewriteSetCC(MachineBasicBlock &TestMBB, 136 MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, 137 MachineInstr &SetCCI, MachineOperand &FlagUse, 138 CondRegArray &CondRegs); 139 }; 140 141 } // end anonymous namespace 142 143 INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE, 144 "X86 EFLAGS copy lowering", false, false) 145 INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE, 146 "X86 EFLAGS copy lowering", false, false) 147 148 FunctionPass *llvm::createX86FlagsCopyLoweringPass() { 149 return new X86FlagsCopyLoweringPass(); 150 } 151 152 char X86FlagsCopyLoweringPass::ID = 0; 153 154 void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const { 155 AU.addRequired<MachineDominatorTree>(); 156 MachineFunctionPass::getAnalysisUsage(AU); 157 } 158 159 namespace { 160 /// An enumeration of the arithmetic instruction mnemonics which have 161 /// interesting flag semantics. 162 /// 163 /// We can map instruction opcodes into these mnemonics to make it easy to 164 /// dispatch with specific functionality. 165 enum class FlagArithMnemonic { 166 ADC, 167 ADCX, 168 ADOX, 169 RCL, 170 RCR, 171 SBB, 172 }; 173 } // namespace 174 175 static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) { 176 switch (Opcode) { 177 default: 178 report_fatal_error("No support for lowering a copy into EFLAGS when used " 179 "by this instruction!"); 180 181 #define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX) \ 182 case X86::MNEMONIC##8##SUFFIX: \ 183 case X86::MNEMONIC##16##SUFFIX: \ 184 case X86::MNEMONIC##32##SUFFIX: \ 185 case X86::MNEMONIC##64##SUFFIX: 186 187 #define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \ 188 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \ 189 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \ 190 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \ 191 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \ 192 case X86::MNEMONIC##8ri: \ 193 case X86::MNEMONIC##16ri8: \ 194 case X86::MNEMONIC##32ri8: \ 195 case X86::MNEMONIC##64ri8: \ 196 case X86::MNEMONIC##16ri: \ 197 case X86::MNEMONIC##32ri: \ 198 case X86::MNEMONIC##64ri32: \ 199 case X86::MNEMONIC##8mi: \ 200 case X86::MNEMONIC##16mi8: \ 201 case X86::MNEMONIC##32mi8: \ 202 case X86::MNEMONIC##64mi8: \ 203 case X86::MNEMONIC##16mi: \ 204 case X86::MNEMONIC##32mi: \ 205 case X86::MNEMONIC##64mi32: \ 206 case X86::MNEMONIC##8i8: \ 207 case X86::MNEMONIC##16i16: \ 208 case X86::MNEMONIC##32i32: \ 209 case X86::MNEMONIC##64i32: 210 211 LLVM_EXPAND_ADC_SBB_INSTR(ADC) 212 return FlagArithMnemonic::ADC; 213 214 LLVM_EXPAND_ADC_SBB_INSTR(SBB) 215 return FlagArithMnemonic::SBB; 216 217 #undef LLVM_EXPAND_ADC_SBB_INSTR 218 219 LLVM_EXPAND_INSTR_SIZES(RCL, rCL) 220 LLVM_EXPAND_INSTR_SIZES(RCL, r1) 221 LLVM_EXPAND_INSTR_SIZES(RCL, ri) 222 return FlagArithMnemonic::RCL; 223 224 LLVM_EXPAND_INSTR_SIZES(RCR, rCL) 225 LLVM_EXPAND_INSTR_SIZES(RCR, r1) 226 LLVM_EXPAND_INSTR_SIZES(RCR, ri) 227 return FlagArithMnemonic::RCR; 228 229 #undef LLVM_EXPAND_INSTR_SIZES 230 231 case X86::ADCX32rr: 232 case X86::ADCX64rr: 233 case X86::ADCX32rm: 234 case X86::ADCX64rm: 235 return FlagArithMnemonic::ADCX; 236 237 case X86::ADOX32rr: 238 case X86::ADOX64rr: 239 case X86::ADOX32rm: 240 case X86::ADOX64rm: 241 return FlagArithMnemonic::ADOX; 242 } 243 } 244 245 static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB, 246 MachineInstr &SplitI, 247 const X86InstrInfo &TII) { 248 MachineFunction &MF = *MBB.getParent(); 249 250 assert(SplitI.getParent() == &MBB && 251 "Split instruction must be in the split block!"); 252 assert(SplitI.isBranch() && 253 "Only designed to split a tail of branch instructions!"); 254 assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID && 255 "Must split on an actual jCC instruction!"); 256 257 // Dig out the previous instruction to the split point. 258 MachineInstr &PrevI = *std::prev(SplitI.getIterator()); 259 assert(PrevI.isBranch() && "Must split after a branch!"); 260 assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID && 261 "Must split after an actual jCC instruction!"); 262 assert(!std::prev(PrevI.getIterator())->isTerminator() && 263 "Must only have this one terminator prior to the split!"); 264 265 // Grab the one successor edge that will stay in `MBB`. 266 MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB(); 267 268 // Analyze the original block to see if we are actually splitting an edge 269 // into two edges. This can happen when we have multiple conditional jumps to 270 // the same successor. 271 bool IsEdgeSplit = 272 std::any_of(SplitI.getIterator(), MBB.instr_end(), 273 [&](MachineInstr &MI) { 274 assert(MI.isTerminator() && 275 "Should only have spliced terminators!"); 276 return llvm::any_of( 277 MI.operands(), [&](MachineOperand &MOp) { 278 return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc; 279 }); 280 }) || 281 MBB.getFallThrough() == &UnsplitSucc; 282 283 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock(); 284 285 // Insert the new block immediately after the current one. Any existing 286 // fallthrough will be sunk into this new block anyways. 287 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB); 288 289 // Splice the tail of instructions into the new block. 290 NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end()); 291 292 // Copy the necessary succesors (and their probability info) into the new 293 // block. 294 for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) 295 if (IsEdgeSplit || *SI != &UnsplitSucc) 296 NewMBB.copySuccessor(&MBB, SI); 297 // Normalize the probabilities if we didn't end up splitting the edge. 298 if (!IsEdgeSplit) 299 NewMBB.normalizeSuccProbs(); 300 301 // Now replace all of the moved successors in the original block with the new 302 // block. This will merge their probabilities. 303 for (MachineBasicBlock *Succ : NewMBB.successors()) 304 if (Succ != &UnsplitSucc) 305 MBB.replaceSuccessor(Succ, &NewMBB); 306 307 // We should always end up replacing at least one successor. 308 assert(MBB.isSuccessor(&NewMBB) && 309 "Failed to make the new block a successor!"); 310 311 // Now update all the PHIs. 312 for (MachineBasicBlock *Succ : NewMBB.successors()) { 313 for (MachineInstr &MI : *Succ) { 314 if (!MI.isPHI()) 315 break; 316 317 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps; 318 OpIdx += 2) { 319 MachineOperand &OpV = MI.getOperand(OpIdx); 320 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1); 321 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!"); 322 if (OpMBB.getMBB() != &MBB) 323 continue; 324 325 // Replace the operand for unsplit successors 326 if (!IsEdgeSplit || Succ != &UnsplitSucc) { 327 OpMBB.setMBB(&NewMBB); 328 329 // We have to continue scanning as there may be multiple entries in 330 // the PHI. 331 continue; 332 } 333 334 // When we have split the edge append a new successor. 335 MI.addOperand(MF, OpV); 336 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB)); 337 break; 338 } 339 } 340 } 341 342 return NewMBB; 343 } 344 345 bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { 346 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() 347 << " **********\n"); 348 349 auto &Subtarget = MF.getSubtarget<X86Subtarget>(); 350 MRI = &MF.getRegInfo(); 351 TII = Subtarget.getInstrInfo(); 352 TRI = Subtarget.getRegisterInfo(); 353 MDT = &getAnalysis<MachineDominatorTree>(); 354 PromoteRC = &X86::GR8RegClass; 355 356 if (MF.begin() == MF.end()) 357 // Nothing to do for a degenerate empty function... 358 return false; 359 360 // Collect the copies in RPO so that when there are chains where a copy is in 361 // turn copied again we visit the first one first. This ensures we can find 362 // viable locations for testing the original EFLAGS that dominate all the 363 // uses across complex CFGs. 364 SmallVector<MachineInstr *, 4> Copies; 365 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); 366 for (MachineBasicBlock *MBB : RPOT) 367 for (MachineInstr &MI : *MBB) 368 if (MI.getOpcode() == TargetOpcode::COPY && 369 MI.getOperand(0).getReg() == X86::EFLAGS) 370 Copies.push_back(&MI); 371 372 for (MachineInstr *CopyI : Copies) { 373 MachineBasicBlock &MBB = *CopyI->getParent(); 374 375 MachineOperand &VOp = CopyI->getOperand(1); 376 assert(VOp.isReg() && 377 "The input to the copy for EFLAGS should always be a register!"); 378 MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg()); 379 if (CopyDefI.getOpcode() != TargetOpcode::COPY) { 380 // FIXME: The big likely candidate here are PHI nodes. We could in theory 381 // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard 382 // enough that it is probably better to change every other part of LLVM 383 // to avoid creating them. The issue is that once we have PHIs we won't 384 // know which original EFLAGS value we need to capture with our setCCs 385 // below. The end result will be computing a complete set of setCCs that 386 // we *might* want, computing them in every place where we copy *out* of 387 // EFLAGS and then doing SSA formation on all of them to insert necessary 388 // PHI nodes and consume those here. Then hoping that somehow we DCE the 389 // unnecessary ones. This DCE seems very unlikely to be successful and so 390 // we will almost certainly end up with a glut of dead setCC 391 // instructions. Until we have a motivating test case and fail to avoid 392 // it by changing other parts of LLVM's lowering, we refuse to handle 393 // this complex case here. 394 LLVM_DEBUG( 395 dbgs() << "ERROR: Encountered unexpected def of an eflags copy: "; 396 CopyDefI.dump()); 397 report_fatal_error( 398 "Cannot lower EFLAGS copy unless it is defined in turn by a copy!"); 399 } 400 401 auto Cleanup = make_scope_exit([&] { 402 // All uses of the EFLAGS copy are now rewritten, kill the copy into 403 // eflags and if dead the copy from. 404 CopyI->eraseFromParent(); 405 if (MRI->use_empty(CopyDefI.getOperand(0).getReg())) 406 CopyDefI.eraseFromParent(); 407 ++NumCopiesEliminated; 408 }); 409 410 MachineOperand &DOp = CopyI->getOperand(0); 411 assert(DOp.isDef() && "Expected register def!"); 412 assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!"); 413 if (DOp.isDead()) 414 continue; 415 416 MachineBasicBlock *TestMBB = CopyDefI.getParent(); 417 auto TestPos = CopyDefI.getIterator(); 418 DebugLoc TestLoc = CopyDefI.getDebugLoc(); 419 420 LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump()); 421 422 // Walk up across live-in EFLAGS to find where they were actually def'ed. 423 // 424 // This copy's def may just be part of a region of blocks covered by 425 // a single def of EFLAGS and we want to find the top of that region where 426 // possible. 427 // 428 // This is essentially a search for a *candidate* reaching definition 429 // location. We don't need to ever find the actual reaching definition here, 430 // but we want to walk up the dominator tree to find the highest point which 431 // would be viable for such a definition. 432 auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin, 433 MachineBasicBlock::iterator End) { 434 // Scan backwards as we expect these to be relatively short and often find 435 // a clobber near the end. 436 return llvm::any_of( 437 llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) { 438 // Flag any instruction (other than the copy we are 439 // currently rewriting) that defs EFLAGS. 440 return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS); 441 }); 442 }; 443 auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB, 444 MachineBasicBlock *EndMBB) { 445 assert(MDT->dominates(BeginMBB, EndMBB) && 446 "Only support paths down the dominator tree!"); 447 SmallPtrSet<MachineBasicBlock *, 4> Visited; 448 SmallVector<MachineBasicBlock *, 4> Worklist; 449 // We terminate at the beginning. No need to scan it. 450 Visited.insert(BeginMBB); 451 Worklist.push_back(EndMBB); 452 do { 453 auto *MBB = Worklist.pop_back_val(); 454 for (auto *PredMBB : MBB->predecessors()) { 455 if (!Visited.insert(PredMBB).second) 456 continue; 457 if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end())) 458 return true; 459 // Enqueue this block to walk its predecessors. 460 Worklist.push_back(PredMBB); 461 } 462 } while (!Worklist.empty()); 463 // No clobber found along a path from the begin to end. 464 return false; 465 }; 466 while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() && 467 !HasEFLAGSClobber(TestMBB->begin(), TestPos)) { 468 // Find the nearest common dominator of the predecessors, as 469 // that will be the best candidate to hoist into. 470 MachineBasicBlock *HoistMBB = 471 std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(), 472 *TestMBB->pred_begin(), 473 [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) { 474 return MDT->findNearestCommonDominator(LHS, RHS); 475 }); 476 477 // Now we need to scan all predecessors that may be reached along paths to 478 // the hoist block. A clobber anywhere in any of these blocks the hoist. 479 // Note that this even handles loops because we require *no* clobbers. 480 if (HasEFLAGSClobberPath(HoistMBB, TestMBB)) 481 break; 482 483 // We also need the terminators to not sneakily clobber flags. 484 if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(), 485 HoistMBB->instr_end())) 486 break; 487 488 // We found a viable location, hoist our test position to it. 489 TestMBB = HoistMBB; 490 TestPos = TestMBB->getFirstTerminator()->getIterator(); 491 // Clear the debug location as it would just be confusing after hoisting. 492 TestLoc = DebugLoc(); 493 } 494 LLVM_DEBUG({ 495 auto DefIt = llvm::find_if( 496 llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)), 497 [&](MachineInstr &MI) { 498 return MI.findRegisterDefOperand(X86::EFLAGS); 499 }); 500 if (DefIt.base() != TestMBB->instr_begin()) { 501 dbgs() << " Using EFLAGS defined by: "; 502 DefIt->dump(); 503 } else { 504 dbgs() << " Using live-in flags for BB:\n"; 505 TestMBB->dump(); 506 } 507 }); 508 509 // While rewriting uses, we buffer jumps and rewrite them in a second pass 510 // because doing so will perturb the CFG that we are walking to find the 511 // uses in the first place. 512 SmallVector<MachineInstr *, 4> JmpIs; 513 514 // Gather the condition flags that have already been preserved in 515 // registers. We do this from scratch each time as we expect there to be 516 // very few of them and we expect to not revisit the same copy definition 517 // many times. If either of those change sufficiently we could build a map 518 // of these up front instead. 519 CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos); 520 521 // Collect the basic blocks we need to scan. Typically this will just be 522 // a single basic block but we may have to scan multiple blocks if the 523 // EFLAGS copy lives into successors. 524 SmallVector<MachineBasicBlock *, 2> Blocks; 525 SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks; 526 Blocks.push_back(&MBB); 527 528 do { 529 MachineBasicBlock &UseMBB = *Blocks.pop_back_val(); 530 531 // Track when if/when we find a kill of the flags in this block. 532 bool FlagsKilled = false; 533 534 // In most cases, we walk from the beginning to the end of the block. But 535 // when the block is the same block as the copy is from, we will visit it 536 // twice. The first time we start from the copy and go to the end. The 537 // second time we start from the beginning and go to the copy. This lets 538 // us handle copies inside of cycles. 539 // FIXME: This loop is *super* confusing. This is at least in part 540 // a symptom of all of this routine needing to be refactored into 541 // documentable components. Once done, there may be a better way to write 542 // this loop. 543 for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB)) 544 ? std::next(CopyI->getIterator()) 545 : UseMBB.instr_begin(), 546 MIE = UseMBB.instr_end(); 547 MII != MIE;) { 548 MachineInstr &MI = *MII++; 549 // If we are in the original copy block and encounter either the copy 550 // def or the copy itself, break so that we don't re-process any part of 551 // the block or process the instructions in the range that was copied 552 // over. 553 if (&MI == CopyI || &MI == &CopyDefI) { 554 assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) && 555 "Should only encounter these on the second pass over the " 556 "original block."); 557 break; 558 } 559 560 MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS); 561 if (!FlagUse) { 562 if (MI.findRegisterDefOperand(X86::EFLAGS)) { 563 // If EFLAGS are defined, it's as-if they were killed. We can stop 564 // scanning here. 565 // 566 // NB!!! Many instructions only modify some flags. LLVM currently 567 // models this as clobbering all flags, but if that ever changes 568 // this will need to be carefully updated to handle that more 569 // complex logic. 570 FlagsKilled = true; 571 break; 572 } 573 continue; 574 } 575 576 LLVM_DEBUG(dbgs() << " Rewriting use: "; MI.dump()); 577 578 // Check the kill flag before we rewrite as that may change it. 579 if (FlagUse->isKill()) 580 FlagsKilled = true; 581 582 // Once we encounter a branch, the rest of the instructions must also be 583 // branches. We can't rewrite in place here, so we handle them below. 584 // 585 // Note that we don't have to handle tail calls here, even conditional 586 // tail calls, as those are not introduced into the X86 MI until post-RA 587 // branch folding or black placement. As a consequence, we get to deal 588 // with the simpler formulation of conditional branches followed by tail 589 // calls. 590 if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) { 591 auto JmpIt = MI.getIterator(); 592 do { 593 JmpIs.push_back(&*JmpIt); 594 ++JmpIt; 595 } while (JmpIt != UseMBB.instr_end() && 596 X86::getCondFromBranchOpc(JmpIt->getOpcode()) != 597 X86::COND_INVALID); 598 break; 599 } 600 601 // Otherwise we can just rewrite in-place. 602 if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) { 603 rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); 604 } else if (X86::getCondFromSETOpc(MI.getOpcode()) != 605 X86::COND_INVALID) { 606 rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); 607 } else if (MI.getOpcode() == TargetOpcode::COPY) { 608 rewriteCopy(MI, *FlagUse, CopyDefI); 609 } else { 610 // We assume all other instructions that use flags also def them. 611 assert(MI.findRegisterDefOperand(X86::EFLAGS) && 612 "Expected a def of EFLAGS for this instruction!"); 613 614 // NB!!! Several arithmetic instructions only *partially* update 615 // flags. Theoretically, we could generate MI code sequences that 616 // would rely on this fact and observe different flags independently. 617 // But currently LLVM models all of these instructions as clobbering 618 // all the flags in an undef way. We rely on that to simplify the 619 // logic. 620 FlagsKilled = true; 621 622 switch (MI.getOpcode()) { 623 case X86::SETB_C8r: 624 case X86::SETB_C16r: 625 case X86::SETB_C32r: 626 case X86::SETB_C64r: 627 // Use custom lowering for arithmetic that is merely extending the 628 // carry flag. We model this as the SETB_C* pseudo instructions. 629 rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse, 630 CondRegs); 631 break; 632 633 default: 634 // Generically handle remaining uses as arithmetic instructions. 635 rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse, 636 CondRegs); 637 break; 638 } 639 break; 640 } 641 642 // If this was the last use of the flags, we're done. 643 if (FlagsKilled) 644 break; 645 } 646 647 // If the flags were killed, we're done with this block. 648 if (FlagsKilled) 649 continue; 650 651 // Otherwise we need to scan successors for ones where the flags live-in 652 // and queue those up for processing. 653 for (MachineBasicBlock *SuccMBB : UseMBB.successors()) 654 if (SuccMBB->isLiveIn(X86::EFLAGS) && 655 VisitedBlocks.insert(SuccMBB).second) { 656 // We currently don't do any PHI insertion and so we require that the 657 // test basic block dominates all of the use basic blocks. Further, we 658 // can't have a cycle from the test block back to itself as that would 659 // create a cycle requiring a PHI to break it. 660 // 661 // We could in theory do PHI insertion here if it becomes useful by 662 // just taking undef values in along every edge that we don't trace 663 // this EFLAGS copy along. This isn't as bad as fully general PHI 664 // insertion, but still seems like a great deal of complexity. 665 // 666 // Because it is theoretically possible that some earlier MI pass or 667 // other lowering transformation could induce this to happen, we do 668 // a hard check even in non-debug builds here. 669 if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) { 670 LLVM_DEBUG({ 671 dbgs() 672 << "ERROR: Encountered use that is not dominated by our test " 673 "basic block! Rewriting this would require inserting PHI " 674 "nodes to track the flag state across the CFG.\n\nTest " 675 "block:\n"; 676 TestMBB->dump(); 677 dbgs() << "Use block:\n"; 678 SuccMBB->dump(); 679 }); 680 report_fatal_error( 681 "Cannot lower EFLAGS copy when original copy def " 682 "does not dominate all uses."); 683 } 684 685 Blocks.push_back(SuccMBB); 686 } 687 } while (!Blocks.empty()); 688 689 // Now rewrite the jumps that use the flags. These we handle specially 690 // because if there are multiple jumps in a single basic block we'll have 691 // to do surgery on the CFG. 692 MachineBasicBlock *LastJmpMBB = nullptr; 693 for (MachineInstr *JmpI : JmpIs) { 694 // Past the first jump within a basic block we need to split the blocks 695 // apart. 696 if (JmpI->getParent() == LastJmpMBB) 697 splitBlock(*JmpI->getParent(), *JmpI, *TII); 698 else 699 LastJmpMBB = JmpI->getParent(); 700 701 rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs); 702 } 703 704 // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if 705 // the copy's def operand is itself a kill. 706 } 707 708 #ifndef NDEBUG 709 for (MachineBasicBlock &MBB : MF) 710 for (MachineInstr &MI : MBB) 711 if (MI.getOpcode() == TargetOpcode::COPY && 712 (MI.getOperand(0).getReg() == X86::EFLAGS || 713 MI.getOperand(1).getReg() == X86::EFLAGS)) { 714 LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: "; 715 MI.dump()); 716 llvm_unreachable("Unlowered EFLAGS copy!"); 717 } 718 #endif 719 720 return true; 721 } 722 723 /// Collect any conditions that have already been set in registers so that we 724 /// can re-use them rather than adding duplicates. 725 CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( 726 MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) { 727 CondRegArray CondRegs = {}; 728 729 // Scan backwards across the range of instructions with live EFLAGS. 730 for (MachineInstr &MI : 731 llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) { 732 X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode()); 733 if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() && 734 TRI->isVirtualRegister(MI.getOperand(0).getReg())) { 735 assert(MI.getOperand(0).isDef() && 736 "A non-storing SETcc should always define a register!"); 737 CondRegs[Cond] = MI.getOperand(0).getReg(); 738 } 739 740 // Stop scanning when we see the first definition of the EFLAGS as prior to 741 // this we would potentially capture the wrong flag state. 742 if (MI.findRegisterDefOperand(X86::EFLAGS)) 743 break; 744 } 745 return CondRegs; 746 } 747 748 unsigned X86FlagsCopyLoweringPass::promoteCondToReg( 749 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 750 DebugLoc TestLoc, X86::CondCode Cond) { 751 unsigned Reg = MRI->createVirtualRegister(PromoteRC); 752 auto SetI = BuildMI(TestMBB, TestPos, TestLoc, 753 TII->get(X86::getSETFromCond(Cond)), Reg); 754 (void)SetI; 755 LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump()); 756 ++NumSetCCsInserted; 757 return Reg; 758 } 759 760 std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg( 761 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 762 DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) { 763 unsigned &CondReg = CondRegs[Cond]; 764 unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)]; 765 if (!CondReg && !InvCondReg) 766 CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); 767 768 if (CondReg) 769 return {CondReg, false}; 770 else 771 return {InvCondReg, true}; 772 } 773 774 void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB, 775 MachineBasicBlock::iterator Pos, 776 DebugLoc Loc, unsigned Reg) { 777 auto TestI = 778 BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg); 779 (void)TestI; 780 LLVM_DEBUG(dbgs() << " test cond: "; TestI->dump()); 781 ++NumTestsInserted; 782 } 783 784 void X86FlagsCopyLoweringPass::rewriteArithmetic( 785 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 786 DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse, 787 CondRegArray &CondRegs) { 788 // Arithmetic is either reading CF or OF. Figure out which condition we need 789 // to preserve in a register. 790 X86::CondCode Cond; 791 792 // The addend to use to reset CF or OF when added to the flag value. 793 int Addend; 794 795 switch (getMnemonicFromOpcode(MI.getOpcode())) { 796 case FlagArithMnemonic::ADC: 797 case FlagArithMnemonic::ADCX: 798 case FlagArithMnemonic::RCL: 799 case FlagArithMnemonic::RCR: 800 case FlagArithMnemonic::SBB: 801 Cond = X86::COND_B; // CF == 1 802 // Set up an addend that when one is added will need a carry due to not 803 // having a higher bit available. 804 Addend = 255; 805 break; 806 807 case FlagArithMnemonic::ADOX: 808 Cond = X86::COND_O; // OF == 1 809 // Set up an addend that when one is added will turn from positive to 810 // negative and thus overflow in the signed domain. 811 Addend = 127; 812 break; 813 } 814 815 // Now get a register that contains the value of the flag input to the 816 // arithmetic. We require exactly this flag to simplify the arithmetic 817 // required to materialize it back into the flag. 818 unsigned &CondReg = CondRegs[Cond]; 819 if (!CondReg) 820 CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); 821 822 MachineBasicBlock &MBB = *MI.getParent(); 823 824 // Insert an instruction that will set the flag back to the desired value. 825 unsigned TmpReg = MRI->createVirtualRegister(PromoteRC); 826 auto AddI = 827 BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri)) 828 .addDef(TmpReg, RegState::Dead) 829 .addReg(CondReg) 830 .addImm(Addend); 831 (void)AddI; 832 LLVM_DEBUG(dbgs() << " add cond: "; AddI->dump()); 833 ++NumAddsInserted; 834 FlagUse.setIsKill(true); 835 } 836 837 void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB, 838 MachineBasicBlock::iterator TestPos, 839 DebugLoc TestLoc, 840 MachineInstr &CMovI, 841 MachineOperand &FlagUse, 842 CondRegArray &CondRegs) { 843 // First get the register containing this specific condition. 844 X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode()); 845 unsigned CondReg; 846 bool Inverted; 847 std::tie(CondReg, Inverted) = 848 getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); 849 850 MachineBasicBlock &MBB = *CMovI.getParent(); 851 852 // Insert a direct test of the saved register. 853 insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg); 854 855 // Rewrite the CMov to use the !ZF flag from the test (but match register 856 // size and memory operand), and then kill its use of the flags afterward. 857 auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg()); 858 CMovI.setDesc(TII->get(X86::getCMovFromCond( 859 Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8, 860 !CMovI.memoperands_empty()))); 861 FlagUse.setIsKill(true); 862 LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump()); 863 } 864 865 void X86FlagsCopyLoweringPass::rewriteCondJmp( 866 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 867 DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) { 868 // First get the register containing this specific condition. 869 X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode()); 870 unsigned CondReg; 871 bool Inverted; 872 std::tie(CondReg, Inverted) = 873 getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); 874 875 MachineBasicBlock &JmpMBB = *JmpI.getParent(); 876 877 // Insert a direct test of the saved register. 878 insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg); 879 880 // Rewrite the jump to use the !ZF flag from the test, and kill its use of 881 // flags afterward. 882 JmpI.setDesc(TII->get( 883 X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE))); 884 const int ImplicitEFLAGSOpIdx = 1; 885 JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true); 886 LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump()); 887 } 888 889 void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI, 890 MachineOperand &FlagUse, 891 MachineInstr &CopyDefI) { 892 // Just replace this copy with the original copy def. 893 MRI->replaceRegWith(MI.getOperand(0).getReg(), 894 CopyDefI.getOperand(0).getReg()); 895 MI.eraseFromParent(); 896 } 897 898 void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( 899 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 900 DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse, 901 CondRegArray &CondRegs) { 902 // This routine is only used to handle pseudos for setting a register to zero 903 // or all ones based on CF. This is essentially the sign extended from 1-bit 904 // form of SETB and modeled with the SETB_C* pseudos. They require special 905 // handling as they aren't normal SETcc instructions and are lowered to an 906 // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that 907 // they are only provided in reg-defining forms. A complicating factor is that 908 // they can define many different register widths. 909 assert(SetBI.getOperand(0).isReg() && 910 "Cannot have a non-register defined operand to this variant of SETB!"); 911 912 // Little helper to do the common final step of replacing the register def'ed 913 // by this SETB instruction with a new register and removing the SETB 914 // instruction. 915 auto RewriteToReg = [&](unsigned Reg) { 916 MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg); 917 SetBI.eraseFromParent(); 918 }; 919 920 // Grab the register class used for this particular instruction. 921 auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg()); 922 923 MachineBasicBlock &MBB = *SetBI.getParent(); 924 auto SetPos = SetBI.getIterator(); 925 auto SetLoc = SetBI.getDebugLoc(); 926 927 auto AdjustReg = [&](unsigned Reg) { 928 auto &OrigRC = *MRI->getRegClass(Reg); 929 if (&OrigRC == &SetBRC) 930 return Reg; 931 932 unsigned NewReg; 933 934 int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8; 935 int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8; 936 assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!"); 937 assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!"); 938 int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit, 939 X86::NoSubRegister, X86::sub_32bit}; 940 941 // If the original size is smaller than the target *and* is smaller than 4 942 // bytes, we need to explicitly zero extend it. We always extend to 4-bytes 943 // to maximize the chance of being able to CSE that operation and to avoid 944 // partial dependency stalls extending to 2-bytes. 945 if (OrigRegSize < TargetRegSize && OrigRegSize < 4) { 946 NewReg = MRI->createVirtualRegister(&X86::GR32RegClass); 947 BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg) 948 .addReg(Reg); 949 if (&SetBRC == &X86::GR32RegClass) 950 return NewReg; 951 Reg = NewReg; 952 OrigRegSize = 4; 953 } 954 955 NewReg = MRI->createVirtualRegister(&SetBRC); 956 if (OrigRegSize < TargetRegSize) { 957 BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG), 958 NewReg) 959 .addImm(0) 960 .addReg(Reg) 961 .addImm(SubRegIdx[OrigRegSize]); 962 } else if (OrigRegSize > TargetRegSize) { 963 BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::EXTRACT_SUBREG), 964 NewReg) 965 .addReg(Reg) 966 .addImm(SubRegIdx[TargetRegSize]); 967 } else { 968 BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg) 969 .addReg(Reg); 970 } 971 return NewReg; 972 }; 973 974 unsigned &CondReg = CondRegs[X86::COND_B]; 975 if (!CondReg) 976 CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B); 977 978 // Adjust the condition to have the desired register width by zero-extending 979 // as needed. 980 // FIXME: We should use a better API to avoid the local reference and using a 981 // different variable here. 982 unsigned ExtCondReg = AdjustReg(CondReg); 983 984 // Now we need to turn this into a bitmask. We do this by subtracting it from 985 // zero. 986 unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); 987 BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg); 988 ZeroReg = AdjustReg(ZeroReg); 989 990 unsigned Sub; 991 switch (SetBI.getOpcode()) { 992 case X86::SETB_C8r: 993 Sub = X86::SUB8rr; 994 break; 995 996 case X86::SETB_C16r: 997 Sub = X86::SUB16rr; 998 break; 999 1000 case X86::SETB_C32r: 1001 Sub = X86::SUB32rr; 1002 break; 1003 1004 case X86::SETB_C64r: 1005 Sub = X86::SUB64rr; 1006 break; 1007 1008 default: 1009 llvm_unreachable("Invalid SETB_C* opcode!"); 1010 } 1011 unsigned ResultReg = MRI->createVirtualRegister(&SetBRC); 1012 BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg) 1013 .addReg(ZeroReg) 1014 .addReg(ExtCondReg); 1015 return RewriteToReg(ResultReg); 1016 } 1017 1018 void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB, 1019 MachineBasicBlock::iterator TestPos, 1020 DebugLoc TestLoc, 1021 MachineInstr &SetCCI, 1022 MachineOperand &FlagUse, 1023 CondRegArray &CondRegs) { 1024 X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode()); 1025 // Note that we can't usefully rewrite this to the inverse without complex 1026 // analysis of the users of the setCC. Largely we rely on duplicates which 1027 // could have been avoided already being avoided here. 1028 unsigned &CondReg = CondRegs[Cond]; 1029 if (!CondReg) 1030 CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); 1031 1032 // Rewriting a register def is trivial: we just replace the register and 1033 // remove the setcc. 1034 if (!SetCCI.mayStore()) { 1035 assert(SetCCI.getOperand(0).isReg() && 1036 "Cannot have a non-register defined operand to SETcc!"); 1037 MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg); 1038 SetCCI.eraseFromParent(); 1039 return; 1040 } 1041 1042 // Otherwise, we need to emit a store. 1043 auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(), 1044 SetCCI.getDebugLoc(), TII->get(X86::MOV8mr)); 1045 // Copy the address operands. 1046 for (int i = 0; i < X86::AddrNumOperands; ++i) 1047 MIB.add(SetCCI.getOperand(i)); 1048 1049 MIB.addReg(CondReg); 1050 1051 MIB->setMemRefs(SetCCI.memoperands_begin(), SetCCI.memoperands_end()); 1052 1053 SetCCI.eraseFromParent(); 1054 return; 1055 } 1056