1 //====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 ///
11 /// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual
12 /// flag bits.
13 ///
14 /// We have to do this by carefully analyzing and rewriting the usage of the
15 /// copied EFLAGS register because there is no general way to rematerialize the
16 /// entire EFLAGS register safely and efficiently. Using `popf` both forces
17 /// dynamic stack adjustment and can create correctness issues due to IF, TF,
18 /// and other non-status flags being overwritten. Using sequences involving
19 /// SAHF don't work on all x86 processors and are often quite slow compared to
20 /// directly testing a single status preserved in its own GPR.
21 ///
22 //===----------------------------------------------------------------------===//
23 
24 #include "X86.h"
25 #include "X86InstrBuilder.h"
26 #include "X86InstrInfo.h"
27 #include "X86Subtarget.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/PostOrderIterator.h"
31 #include "llvm/ADT/STLExtras.h"
32 #include "llvm/ADT/ScopeExit.h"
33 #include "llvm/ADT/SmallPtrSet.h"
34 #include "llvm/ADT/SmallSet.h"
35 #include "llvm/ADT/SmallVector.h"
36 #include "llvm/ADT/SparseBitVector.h"
37 #include "llvm/ADT/Statistic.h"
38 #include "llvm/CodeGen/MachineBasicBlock.h"
39 #include "llvm/CodeGen/MachineConstantPool.h"
40 #include "llvm/CodeGen/MachineDominators.h"
41 #include "llvm/CodeGen/MachineFunction.h"
42 #include "llvm/CodeGen/MachineFunctionPass.h"
43 #include "llvm/CodeGen/MachineInstr.h"
44 #include "llvm/CodeGen/MachineInstrBuilder.h"
45 #include "llvm/CodeGen/MachineModuleInfo.h"
46 #include "llvm/CodeGen/MachineOperand.h"
47 #include "llvm/CodeGen/MachineRegisterInfo.h"
48 #include "llvm/CodeGen/MachineSSAUpdater.h"
49 #include "llvm/CodeGen/TargetInstrInfo.h"
50 #include "llvm/CodeGen/TargetRegisterInfo.h"
51 #include "llvm/CodeGen/TargetSchedule.h"
52 #include "llvm/CodeGen/TargetSubtargetInfo.h"
53 #include "llvm/IR/DebugLoc.h"
54 #include "llvm/MC/MCSchedule.h"
55 #include "llvm/Pass.h"
56 #include "llvm/Support/CommandLine.h"
57 #include "llvm/Support/Debug.h"
58 #include "llvm/Support/raw_ostream.h"
59 #include <algorithm>
60 #include <cassert>
61 #include <iterator>
62 #include <utility>
63 
64 using namespace llvm;
65 
66 #define PASS_KEY "x86-flags-copy-lowering"
67 #define DEBUG_TYPE PASS_KEY
68 
69 STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
70 STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
71 STATISTIC(NumTestsInserted, "Number of test instructions inserted");
72 STATISTIC(NumAddsInserted, "Number of adds instructions inserted");
73 
74 namespace llvm {
75 
76 void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
77 
78 } // end namespace llvm
79 
80 namespace {
81 
82 // Convenient array type for storing registers associated with each condition.
83 using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;
84 
85 class X86FlagsCopyLoweringPass : public MachineFunctionPass {
86 public:
87   X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {
88     initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry());
89   }
90 
91   StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
92   bool runOnMachineFunction(MachineFunction &MF) override;
93   void getAnalysisUsage(AnalysisUsage &AU) const override;
94 
95   /// Pass identification, replacement for typeid.
96   static char ID;
97 
98 private:
99   MachineRegisterInfo *MRI;
100   const X86InstrInfo *TII;
101   const TargetRegisterInfo *TRI;
102   const TargetRegisterClass *PromoteRC;
103   MachineDominatorTree *MDT;
104 
105   CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
106                                   MachineBasicBlock::iterator CopyDefI);
107 
108   unsigned promoteCondToReg(MachineBasicBlock &MBB,
109                             MachineBasicBlock::iterator TestPos,
110                             DebugLoc TestLoc, X86::CondCode Cond);
111   std::pair<unsigned, bool>
112   getCondOrInverseInReg(MachineBasicBlock &TestMBB,
113                         MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
114                         X86::CondCode Cond, CondRegArray &CondRegs);
115   void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
116                   DebugLoc Loc, unsigned Reg);
117 
118   void rewriteArithmetic(MachineBasicBlock &TestMBB,
119                          MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
120                          MachineInstr &MI, MachineOperand &FlagUse,
121                          CondRegArray &CondRegs);
122   void rewriteCMov(MachineBasicBlock &TestMBB,
123                    MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
124                    MachineInstr &CMovI, MachineOperand &FlagUse,
125                    CondRegArray &CondRegs);
126   void rewriteCondJmp(MachineBasicBlock &TestMBB,
127                       MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
128                       MachineInstr &JmpI, CondRegArray &CondRegs);
129   void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
130                    MachineInstr &CopyDefI);
131   void rewriteSetCarryExtended(MachineBasicBlock &TestMBB,
132                                MachineBasicBlock::iterator TestPos,
133                                DebugLoc TestLoc, MachineInstr &SetBI,
134                                MachineOperand &FlagUse, CondRegArray &CondRegs);
135   void rewriteSetCC(MachineBasicBlock &TestMBB,
136                     MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
137                     MachineInstr &SetCCI, MachineOperand &FlagUse,
138                     CondRegArray &CondRegs);
139 };
140 
141 } // end anonymous namespace
142 
143 INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE,
144                       "X86 EFLAGS copy lowering", false, false)
145 INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE,
146                     "X86 EFLAGS copy lowering", false, false)
147 
148 FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
149   return new X86FlagsCopyLoweringPass();
150 }
151 
152 char X86FlagsCopyLoweringPass::ID = 0;
153 
154 void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
155   AU.addRequired<MachineDominatorTree>();
156   MachineFunctionPass::getAnalysisUsage(AU);
157 }
158 
159 namespace {
160 /// An enumeration of the arithmetic instruction mnemonics which have
161 /// interesting flag semantics.
162 ///
163 /// We can map instruction opcodes into these mnemonics to make it easy to
164 /// dispatch with specific functionality.
165 enum class FlagArithMnemonic {
166   ADC,
167   ADCX,
168   ADOX,
169   RCL,
170   RCR,
171   SBB,
172 };
173 } // namespace
174 
175 static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
176   switch (Opcode) {
177   default:
178     report_fatal_error("No support for lowering a copy into EFLAGS when used "
179                        "by this instruction!");
180 
181 #define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX)                              \
182   case X86::MNEMONIC##8##SUFFIX:                                               \
183   case X86::MNEMONIC##16##SUFFIX:                                              \
184   case X86::MNEMONIC##32##SUFFIX:                                              \
185   case X86::MNEMONIC##64##SUFFIX:
186 
187 #define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC)                                    \
188   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr)                                        \
189   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV)                                    \
190   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm)                                        \
191   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr)                                        \
192   case X86::MNEMONIC##8ri:                                                     \
193   case X86::MNEMONIC##16ri8:                                                   \
194   case X86::MNEMONIC##32ri8:                                                   \
195   case X86::MNEMONIC##64ri8:                                                   \
196   case X86::MNEMONIC##16ri:                                                    \
197   case X86::MNEMONIC##32ri:                                                    \
198   case X86::MNEMONIC##64ri32:                                                  \
199   case X86::MNEMONIC##8mi:                                                     \
200   case X86::MNEMONIC##16mi8:                                                   \
201   case X86::MNEMONIC##32mi8:                                                   \
202   case X86::MNEMONIC##64mi8:                                                   \
203   case X86::MNEMONIC##16mi:                                                    \
204   case X86::MNEMONIC##32mi:                                                    \
205   case X86::MNEMONIC##64mi32:                                                  \
206   case X86::MNEMONIC##8i8:                                                     \
207   case X86::MNEMONIC##16i16:                                                   \
208   case X86::MNEMONIC##32i32:                                                   \
209   case X86::MNEMONIC##64i32:
210 
211     LLVM_EXPAND_ADC_SBB_INSTR(ADC)
212     return FlagArithMnemonic::ADC;
213 
214     LLVM_EXPAND_ADC_SBB_INSTR(SBB)
215     return FlagArithMnemonic::SBB;
216 
217 #undef LLVM_EXPAND_ADC_SBB_INSTR
218 
219     LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
220     LLVM_EXPAND_INSTR_SIZES(RCL, r1)
221     LLVM_EXPAND_INSTR_SIZES(RCL, ri)
222     return FlagArithMnemonic::RCL;
223 
224     LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
225     LLVM_EXPAND_INSTR_SIZES(RCR, r1)
226     LLVM_EXPAND_INSTR_SIZES(RCR, ri)
227     return FlagArithMnemonic::RCR;
228 
229 #undef LLVM_EXPAND_INSTR_SIZES
230 
231   case X86::ADCX32rr:
232   case X86::ADCX64rr:
233   case X86::ADCX32rm:
234   case X86::ADCX64rm:
235     return FlagArithMnemonic::ADCX;
236 
237   case X86::ADOX32rr:
238   case X86::ADOX64rr:
239   case X86::ADOX32rm:
240   case X86::ADOX64rm:
241     return FlagArithMnemonic::ADOX;
242   }
243 }
244 
245 static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
246                                      MachineInstr &SplitI,
247                                      const X86InstrInfo &TII) {
248   MachineFunction &MF = *MBB.getParent();
249 
250   assert(SplitI.getParent() == &MBB &&
251          "Split instruction must be in the split block!");
252   assert(SplitI.isBranch() &&
253          "Only designed to split a tail of branch instructions!");
254   assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID &&
255          "Must split on an actual jCC instruction!");
256 
257   // Dig out the previous instruction to the split point.
258   MachineInstr &PrevI = *std::prev(SplitI.getIterator());
259   assert(PrevI.isBranch() && "Must split after a branch!");
260   assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID &&
261          "Must split after an actual jCC instruction!");
262   assert(!std::prev(PrevI.getIterator())->isTerminator() &&
263          "Must only have this one terminator prior to the split!");
264 
265   // Grab the one successor edge that will stay in `MBB`.
266   MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB();
267 
268   // Analyze the original block to see if we are actually splitting an edge
269   // into two edges. This can happen when we have multiple conditional jumps to
270   // the same successor.
271   bool IsEdgeSplit =
272       std::any_of(SplitI.getIterator(), MBB.instr_end(),
273                   [&](MachineInstr &MI) {
274                     assert(MI.isTerminator() &&
275                            "Should only have spliced terminators!");
276                     return llvm::any_of(
277                         MI.operands(), [&](MachineOperand &MOp) {
278                           return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc;
279                         });
280                   }) ||
281       MBB.getFallThrough() == &UnsplitSucc;
282 
283   MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
284 
285   // Insert the new block immediately after the current one. Any existing
286   // fallthrough will be sunk into this new block anyways.
287   MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
288 
289   // Splice the tail of instructions into the new block.
290   NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end());
291 
292   // Copy the necessary succesors (and their probability info) into the new
293   // block.
294   for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
295     if (IsEdgeSplit || *SI != &UnsplitSucc)
296       NewMBB.copySuccessor(&MBB, SI);
297   // Normalize the probabilities if we didn't end up splitting the edge.
298   if (!IsEdgeSplit)
299     NewMBB.normalizeSuccProbs();
300 
301   // Now replace all of the moved successors in the original block with the new
302   // block. This will merge their probabilities.
303   for (MachineBasicBlock *Succ : NewMBB.successors())
304     if (Succ != &UnsplitSucc)
305       MBB.replaceSuccessor(Succ, &NewMBB);
306 
307   // We should always end up replacing at least one successor.
308   assert(MBB.isSuccessor(&NewMBB) &&
309          "Failed to make the new block a successor!");
310 
311   // Now update all the PHIs.
312   for (MachineBasicBlock *Succ : NewMBB.successors()) {
313     for (MachineInstr &MI : *Succ) {
314       if (!MI.isPHI())
315         break;
316 
317       for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
318            OpIdx += 2) {
319         MachineOperand &OpV = MI.getOperand(OpIdx);
320         MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
321         assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
322         if (OpMBB.getMBB() != &MBB)
323           continue;
324 
325         // Replace the operand for unsplit successors
326         if (!IsEdgeSplit || Succ != &UnsplitSucc) {
327           OpMBB.setMBB(&NewMBB);
328 
329           // We have to continue scanning as there may be multiple entries in
330           // the PHI.
331           continue;
332         }
333 
334         // When we have split the edge append a new successor.
335         MI.addOperand(MF, OpV);
336         MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
337         break;
338       }
339     }
340   }
341 
342   return NewMBB;
343 }
344 
345 bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
346   LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
347                     << " **********\n");
348 
349   auto &Subtarget = MF.getSubtarget<X86Subtarget>();
350   MRI = &MF.getRegInfo();
351   TII = Subtarget.getInstrInfo();
352   TRI = Subtarget.getRegisterInfo();
353   MDT = &getAnalysis<MachineDominatorTree>();
354   PromoteRC = &X86::GR8RegClass;
355 
356   if (MF.begin() == MF.end())
357     // Nothing to do for a degenerate empty function...
358     return false;
359 
360   // Collect the copies in RPO so that when there are chains where a copy is in
361   // turn copied again we visit the first one first. This ensures we can find
362   // viable locations for testing the original EFLAGS that dominate all the
363   // uses across complex CFGs.
364   SmallVector<MachineInstr *, 4> Copies;
365   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
366   for (MachineBasicBlock *MBB : RPOT)
367     for (MachineInstr &MI : *MBB)
368       if (MI.getOpcode() == TargetOpcode::COPY &&
369           MI.getOperand(0).getReg() == X86::EFLAGS)
370         Copies.push_back(&MI);
371 
372   for (MachineInstr *CopyI : Copies) {
373     MachineBasicBlock &MBB = *CopyI->getParent();
374 
375     MachineOperand &VOp = CopyI->getOperand(1);
376     assert(VOp.isReg() &&
377            "The input to the copy for EFLAGS should always be a register!");
378     MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg());
379     if (CopyDefI.getOpcode() != TargetOpcode::COPY) {
380       // FIXME: The big likely candidate here are PHI nodes. We could in theory
381       // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard
382       // enough that it is probably better to change every other part of LLVM
383       // to avoid creating them. The issue is that once we have PHIs we won't
384       // know which original EFLAGS value we need to capture with our setCCs
385       // below. The end result will be computing a complete set of setCCs that
386       // we *might* want, computing them in every place where we copy *out* of
387       // EFLAGS and then doing SSA formation on all of them to insert necessary
388       // PHI nodes and consume those here. Then hoping that somehow we DCE the
389       // unnecessary ones. This DCE seems very unlikely to be successful and so
390       // we will almost certainly end up with a glut of dead setCC
391       // instructions. Until we have a motivating test case and fail to avoid
392       // it by changing other parts of LLVM's lowering, we refuse to handle
393       // this complex case here.
394       LLVM_DEBUG(
395           dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
396           CopyDefI.dump());
397       report_fatal_error(
398           "Cannot lower EFLAGS copy unless it is defined in turn by a copy!");
399     }
400 
401     auto Cleanup = make_scope_exit([&] {
402       // All uses of the EFLAGS copy are now rewritten, kill the copy into
403       // eflags and if dead the copy from.
404       CopyI->eraseFromParent();
405       if (MRI->use_empty(CopyDefI.getOperand(0).getReg()))
406         CopyDefI.eraseFromParent();
407       ++NumCopiesEliminated;
408     });
409 
410     MachineOperand &DOp = CopyI->getOperand(0);
411     assert(DOp.isDef() && "Expected register def!");
412     assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!");
413     if (DOp.isDead())
414       continue;
415 
416     MachineBasicBlock *TestMBB = CopyDefI.getParent();
417     auto TestPos = CopyDefI.getIterator();
418     DebugLoc TestLoc = CopyDefI.getDebugLoc();
419 
420     LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
421 
422     // Walk up across live-in EFLAGS to find where they were actually def'ed.
423     //
424     // This copy's def may just be part of a region of blocks covered by
425     // a single def of EFLAGS and we want to find the top of that region where
426     // possible.
427     //
428     // This is essentially a search for a *candidate* reaching definition
429     // location. We don't need to ever find the actual reaching definition here,
430     // but we want to walk up the dominator tree to find the highest point which
431     // would be viable for such a definition.
432     auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
433                                 MachineBasicBlock::iterator End) {
434       // Scan backwards as we expect these to be relatively short and often find
435       // a clobber near the end.
436       return llvm::any_of(
437           llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
438             // Flag any instruction (other than the copy we are
439             // currently rewriting) that defs EFLAGS.
440             return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
441           });
442     };
443     auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
444                                     MachineBasicBlock *EndMBB) {
445       assert(MDT->dominates(BeginMBB, EndMBB) &&
446              "Only support paths down the dominator tree!");
447       SmallPtrSet<MachineBasicBlock *, 4> Visited;
448       SmallVector<MachineBasicBlock *, 4> Worklist;
449       // We terminate at the beginning. No need to scan it.
450       Visited.insert(BeginMBB);
451       Worklist.push_back(EndMBB);
452       do {
453         auto *MBB = Worklist.pop_back_val();
454         for (auto *PredMBB : MBB->predecessors()) {
455           if (!Visited.insert(PredMBB).second)
456             continue;
457           if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
458             return true;
459           // Enqueue this block to walk its predecessors.
460           Worklist.push_back(PredMBB);
461         }
462       } while (!Worklist.empty());
463       // No clobber found along a path from the begin to end.
464       return false;
465     };
466     while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
467            !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
468       // Find the nearest common dominator of the predecessors, as
469       // that will be the best candidate to hoist into.
470       MachineBasicBlock *HoistMBB =
471           std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(),
472                           *TestMBB->pred_begin(),
473                           [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
474                             return MDT->findNearestCommonDominator(LHS, RHS);
475                           });
476 
477       // Now we need to scan all predecessors that may be reached along paths to
478       // the hoist block. A clobber anywhere in any of these blocks the hoist.
479       // Note that this even handles loops because we require *no* clobbers.
480       if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
481         break;
482 
483       // We also need the terminators to not sneakily clobber flags.
484       if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
485                            HoistMBB->instr_end()))
486         break;
487 
488       // We found a viable location, hoist our test position to it.
489       TestMBB = HoistMBB;
490       TestPos = TestMBB->getFirstTerminator()->getIterator();
491       // Clear the debug location as it would just be confusing after hoisting.
492       TestLoc = DebugLoc();
493     }
494     LLVM_DEBUG({
495       auto DefIt = llvm::find_if(
496           llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
497           [&](MachineInstr &MI) {
498             return MI.findRegisterDefOperand(X86::EFLAGS);
499           });
500       if (DefIt.base() != TestMBB->instr_begin()) {
501         dbgs() << "  Using EFLAGS defined by: ";
502         DefIt->dump();
503       } else {
504         dbgs() << "  Using live-in flags for BB:\n";
505         TestMBB->dump();
506       }
507     });
508 
509     // While rewriting uses, we buffer jumps and rewrite them in a second pass
510     // because doing so will perturb the CFG that we are walking to find the
511     // uses in the first place.
512     SmallVector<MachineInstr *, 4> JmpIs;
513 
514     // Gather the condition flags that have already been preserved in
515     // registers. We do this from scratch each time as we expect there to be
516     // very few of them and we expect to not revisit the same copy definition
517     // many times. If either of those change sufficiently we could build a map
518     // of these up front instead.
519     CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);
520 
521     // Collect the basic blocks we need to scan. Typically this will just be
522     // a single basic block but we may have to scan multiple blocks if the
523     // EFLAGS copy lives into successors.
524     SmallVector<MachineBasicBlock *, 2> Blocks;
525     SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
526     Blocks.push_back(&MBB);
527 
528     do {
529       MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
530 
531       // Track when if/when we find a kill of the flags in this block.
532       bool FlagsKilled = false;
533 
534       // In most cases, we walk from the beginning to the end of the block. But
535       // when the block is the same block as the copy is from, we will visit it
536       // twice. The first time we start from the copy and go to the end. The
537       // second time we start from the beginning and go to the copy. This lets
538       // us handle copies inside of cycles.
539       // FIXME: This loop is *super* confusing. This is at least in part
540       // a symptom of all of this routine needing to be refactored into
541       // documentable components. Once done, there may be a better way to write
542       // this loop.
543       for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
544                           ? std::next(CopyI->getIterator())
545                           : UseMBB.instr_begin(),
546                 MIE = UseMBB.instr_end();
547            MII != MIE;) {
548         MachineInstr &MI = *MII++;
549         // If we are in the original copy block and encounter either the copy
550         // def or the copy itself, break so that we don't re-process any part of
551         // the block or process the instructions in the range that was copied
552         // over.
553         if (&MI == CopyI || &MI == &CopyDefI) {
554           assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
555                  "Should only encounter these on the second pass over the "
556                  "original block.");
557           break;
558         }
559 
560         MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
561         if (!FlagUse) {
562           if (MI.findRegisterDefOperand(X86::EFLAGS)) {
563             // If EFLAGS are defined, it's as-if they were killed. We can stop
564             // scanning here.
565             //
566             // NB!!! Many instructions only modify some flags. LLVM currently
567             // models this as clobbering all flags, but if that ever changes
568             // this will need to be carefully updated to handle that more
569             // complex logic.
570             FlagsKilled = true;
571             break;
572           }
573           continue;
574         }
575 
576         LLVM_DEBUG(dbgs() << "  Rewriting use: "; MI.dump());
577 
578         // Check the kill flag before we rewrite as that may change it.
579         if (FlagUse->isKill())
580           FlagsKilled = true;
581 
582         // Once we encounter a branch, the rest of the instructions must also be
583         // branches. We can't rewrite in place here, so we handle them below.
584         //
585         // Note that we don't have to handle tail calls here, even conditional
586         // tail calls, as those are not introduced into the X86 MI until post-RA
587         // branch folding or black placement. As a consequence, we get to deal
588         // with the simpler formulation of conditional branches followed by tail
589         // calls.
590         if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
591           auto JmpIt = MI.getIterator();
592           do {
593             JmpIs.push_back(&*JmpIt);
594             ++JmpIt;
595           } while (JmpIt != UseMBB.instr_end() &&
596                    X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
597                        X86::COND_INVALID);
598           break;
599         }
600 
601         // Otherwise we can just rewrite in-place.
602         if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
603           rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
604         } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
605                    X86::COND_INVALID) {
606           rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
607         } else if (MI.getOpcode() == TargetOpcode::COPY) {
608           rewriteCopy(MI, *FlagUse, CopyDefI);
609         } else {
610           // We assume all other instructions that use flags also def them.
611           assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
612                  "Expected a def of EFLAGS for this instruction!");
613 
614           // NB!!! Several arithmetic instructions only *partially* update
615           // flags. Theoretically, we could generate MI code sequences that
616           // would rely on this fact and observe different flags independently.
617           // But currently LLVM models all of these instructions as clobbering
618           // all the flags in an undef way. We rely on that to simplify the
619           // logic.
620           FlagsKilled = true;
621 
622           switch (MI.getOpcode()) {
623           case X86::SETB_C8r:
624           case X86::SETB_C16r:
625           case X86::SETB_C32r:
626           case X86::SETB_C64r:
627             // Use custom lowering for arithmetic that is merely extending the
628             // carry flag. We model this as the SETB_C* pseudo instructions.
629             rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
630                                     CondRegs);
631             break;
632 
633           default:
634             // Generically handle remaining uses as arithmetic instructions.
635             rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
636                               CondRegs);
637             break;
638           }
639           break;
640         }
641 
642         // If this was the last use of the flags, we're done.
643         if (FlagsKilled)
644           break;
645       }
646 
647       // If the flags were killed, we're done with this block.
648       if (FlagsKilled)
649         continue;
650 
651       // Otherwise we need to scan successors for ones where the flags live-in
652       // and queue those up for processing.
653       for (MachineBasicBlock *SuccMBB : UseMBB.successors())
654         if (SuccMBB->isLiveIn(X86::EFLAGS) &&
655             VisitedBlocks.insert(SuccMBB).second) {
656           // We currently don't do any PHI insertion and so we require that the
657           // test basic block dominates all of the use basic blocks. Further, we
658           // can't have a cycle from the test block back to itself as that would
659           // create a cycle requiring a PHI to break it.
660           //
661           // We could in theory do PHI insertion here if it becomes useful by
662           // just taking undef values in along every edge that we don't trace
663           // this EFLAGS copy along. This isn't as bad as fully general PHI
664           // insertion, but still seems like a great deal of complexity.
665           //
666           // Because it is theoretically possible that some earlier MI pass or
667           // other lowering transformation could induce this to happen, we do
668           // a hard check even in non-debug builds here.
669           if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
670             LLVM_DEBUG({
671               dbgs()
672                   << "ERROR: Encountered use that is not dominated by our test "
673                      "basic block! Rewriting this would require inserting PHI "
674                      "nodes to track the flag state across the CFG.\n\nTest "
675                      "block:\n";
676               TestMBB->dump();
677               dbgs() << "Use block:\n";
678               SuccMBB->dump();
679             });
680             report_fatal_error(
681                 "Cannot lower EFLAGS copy when original copy def "
682                 "does not dominate all uses.");
683           }
684 
685           Blocks.push_back(SuccMBB);
686         }
687     } while (!Blocks.empty());
688 
689     // Now rewrite the jumps that use the flags. These we handle specially
690     // because if there are multiple jumps in a single basic block we'll have
691     // to do surgery on the CFG.
692     MachineBasicBlock *LastJmpMBB = nullptr;
693     for (MachineInstr *JmpI : JmpIs) {
694       // Past the first jump within a basic block we need to split the blocks
695       // apart.
696       if (JmpI->getParent() == LastJmpMBB)
697         splitBlock(*JmpI->getParent(), *JmpI, *TII);
698       else
699         LastJmpMBB = JmpI->getParent();
700 
701       rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
702     }
703 
704     // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
705     // the copy's def operand is itself a kill.
706   }
707 
708 #ifndef NDEBUG
709   for (MachineBasicBlock &MBB : MF)
710     for (MachineInstr &MI : MBB)
711       if (MI.getOpcode() == TargetOpcode::COPY &&
712           (MI.getOperand(0).getReg() == X86::EFLAGS ||
713            MI.getOperand(1).getReg() == X86::EFLAGS)) {
714         LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: ";
715                    MI.dump());
716         llvm_unreachable("Unlowered EFLAGS copy!");
717       }
718 #endif
719 
720   return true;
721 }
722 
723 /// Collect any conditions that have already been set in registers so that we
724 /// can re-use them rather than adding duplicates.
725 CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
726     MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
727   CondRegArray CondRegs = {};
728 
729   // Scan backwards across the range of instructions with live EFLAGS.
730   for (MachineInstr &MI :
731        llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
732     X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
733     if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() &&
734         TRI->isVirtualRegister(MI.getOperand(0).getReg())) {
735       assert(MI.getOperand(0).isDef() &&
736              "A non-storing SETcc should always define a register!");
737       CondRegs[Cond] = MI.getOperand(0).getReg();
738     }
739 
740     // Stop scanning when we see the first definition of the EFLAGS as prior to
741     // this we would potentially capture the wrong flag state.
742     if (MI.findRegisterDefOperand(X86::EFLAGS))
743       break;
744   }
745   return CondRegs;
746 }
747 
748 unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
749     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
750     DebugLoc TestLoc, X86::CondCode Cond) {
751   unsigned Reg = MRI->createVirtualRegister(PromoteRC);
752   auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
753                       TII->get(X86::getSETFromCond(Cond)), Reg);
754   (void)SetI;
755   LLVM_DEBUG(dbgs() << "    save cond: "; SetI->dump());
756   ++NumSetCCsInserted;
757   return Reg;
758 }
759 
760 std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
761     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
762     DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
763   unsigned &CondReg = CondRegs[Cond];
764   unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)];
765   if (!CondReg && !InvCondReg)
766     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
767 
768   if (CondReg)
769     return {CondReg, false};
770   else
771     return {InvCondReg, true};
772 }
773 
774 void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
775                                           MachineBasicBlock::iterator Pos,
776                                           DebugLoc Loc, unsigned Reg) {
777   auto TestI =
778       BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg);
779   (void)TestI;
780   LLVM_DEBUG(dbgs() << "    test cond: "; TestI->dump());
781   ++NumTestsInserted;
782 }
783 
784 void X86FlagsCopyLoweringPass::rewriteArithmetic(
785     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
786     DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
787     CondRegArray &CondRegs) {
788   // Arithmetic is either reading CF or OF. Figure out which condition we need
789   // to preserve in a register.
790   X86::CondCode Cond;
791 
792   // The addend to use to reset CF or OF when added to the flag value.
793   int Addend;
794 
795   switch (getMnemonicFromOpcode(MI.getOpcode())) {
796   case FlagArithMnemonic::ADC:
797   case FlagArithMnemonic::ADCX:
798   case FlagArithMnemonic::RCL:
799   case FlagArithMnemonic::RCR:
800   case FlagArithMnemonic::SBB:
801     Cond = X86::COND_B; // CF == 1
802     // Set up an addend that when one is added will need a carry due to not
803     // having a higher bit available.
804     Addend = 255;
805     break;
806 
807   case FlagArithMnemonic::ADOX:
808     Cond = X86::COND_O; // OF == 1
809     // Set up an addend that when one is added will turn from positive to
810     // negative and thus overflow in the signed domain.
811     Addend = 127;
812     break;
813   }
814 
815   // Now get a register that contains the value of the flag input to the
816   // arithmetic. We require exactly this flag to simplify the arithmetic
817   // required to materialize it back into the flag.
818   unsigned &CondReg = CondRegs[Cond];
819   if (!CondReg)
820     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
821 
822   MachineBasicBlock &MBB = *MI.getParent();
823 
824   // Insert an instruction that will set the flag back to the desired value.
825   unsigned TmpReg = MRI->createVirtualRegister(PromoteRC);
826   auto AddI =
827       BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
828           .addDef(TmpReg, RegState::Dead)
829           .addReg(CondReg)
830           .addImm(Addend);
831   (void)AddI;
832   LLVM_DEBUG(dbgs() << "    add cond: "; AddI->dump());
833   ++NumAddsInserted;
834   FlagUse.setIsKill(true);
835 }
836 
837 void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
838                                            MachineBasicBlock::iterator TestPos,
839                                            DebugLoc TestLoc,
840                                            MachineInstr &CMovI,
841                                            MachineOperand &FlagUse,
842                                            CondRegArray &CondRegs) {
843   // First get the register containing this specific condition.
844   X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode());
845   unsigned CondReg;
846   bool Inverted;
847   std::tie(CondReg, Inverted) =
848       getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
849 
850   MachineBasicBlock &MBB = *CMovI.getParent();
851 
852   // Insert a direct test of the saved register.
853   insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
854 
855   // Rewrite the CMov to use the !ZF flag from the test (but match register
856   // size and memory operand), and then kill its use of the flags afterward.
857   auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg());
858   CMovI.setDesc(TII->get(X86::getCMovFromCond(
859       Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8,
860       !CMovI.memoperands_empty())));
861   FlagUse.setIsKill(true);
862   LLVM_DEBUG(dbgs() << "    fixed cmov: "; CMovI.dump());
863 }
864 
865 void X86FlagsCopyLoweringPass::rewriteCondJmp(
866     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
867     DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
868   // First get the register containing this specific condition.
869   X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode());
870   unsigned CondReg;
871   bool Inverted;
872   std::tie(CondReg, Inverted) =
873       getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
874 
875   MachineBasicBlock &JmpMBB = *JmpI.getParent();
876 
877   // Insert a direct test of the saved register.
878   insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);
879 
880   // Rewrite the jump to use the !ZF flag from the test, and kill its use of
881   // flags afterward.
882   JmpI.setDesc(TII->get(
883       X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE)));
884   const int ImplicitEFLAGSOpIdx = 1;
885   JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true);
886   LLVM_DEBUG(dbgs() << "    fixed jCC: "; JmpI.dump());
887 }
888 
889 void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
890                                            MachineOperand &FlagUse,
891                                            MachineInstr &CopyDefI) {
892   // Just replace this copy with the original copy def.
893   MRI->replaceRegWith(MI.getOperand(0).getReg(),
894                       CopyDefI.getOperand(0).getReg());
895   MI.eraseFromParent();
896 }
897 
898 void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
899     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
900     DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse,
901     CondRegArray &CondRegs) {
902   // This routine is only used to handle pseudos for setting a register to zero
903   // or all ones based on CF. This is essentially the sign extended from 1-bit
904   // form of SETB and modeled with the SETB_C* pseudos. They require special
905   // handling as they aren't normal SETcc instructions and are lowered to an
906   // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that
907   // they are only provided in reg-defining forms. A complicating factor is that
908   // they can define many different register widths.
909   assert(SetBI.getOperand(0).isReg() &&
910          "Cannot have a non-register defined operand to this variant of SETB!");
911 
912   // Little helper to do the common final step of replacing the register def'ed
913   // by this SETB instruction with a new register and removing the SETB
914   // instruction.
915   auto RewriteToReg = [&](unsigned Reg) {
916     MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg);
917     SetBI.eraseFromParent();
918   };
919 
920   // Grab the register class used for this particular instruction.
921   auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg());
922 
923   MachineBasicBlock &MBB = *SetBI.getParent();
924   auto SetPos = SetBI.getIterator();
925   auto SetLoc = SetBI.getDebugLoc();
926 
927   auto AdjustReg = [&](unsigned Reg) {
928     auto &OrigRC = *MRI->getRegClass(Reg);
929     if (&OrigRC == &SetBRC)
930       return Reg;
931 
932     unsigned NewReg;
933 
934     int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8;
935     int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8;
936     assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!");
937     assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!");
938     int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit,
939                        X86::NoSubRegister, X86::sub_32bit};
940 
941     // If the original size is smaller than the target *and* is smaller than 4
942     // bytes, we need to explicitly zero extend it. We always extend to 4-bytes
943     // to maximize the chance of being able to CSE that operation and to avoid
944     // partial dependency stalls extending to 2-bytes.
945     if (OrigRegSize < TargetRegSize && OrigRegSize < 4) {
946       NewReg = MRI->createVirtualRegister(&X86::GR32RegClass);
947       BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg)
948           .addReg(Reg);
949       if (&SetBRC == &X86::GR32RegClass)
950         return NewReg;
951       Reg = NewReg;
952       OrigRegSize = 4;
953     }
954 
955     NewReg = MRI->createVirtualRegister(&SetBRC);
956     if (OrigRegSize < TargetRegSize) {
957       BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG),
958               NewReg)
959           .addImm(0)
960           .addReg(Reg)
961           .addImm(SubRegIdx[OrigRegSize]);
962     } else if (OrigRegSize > TargetRegSize) {
963       BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::EXTRACT_SUBREG),
964               NewReg)
965           .addReg(Reg)
966           .addImm(SubRegIdx[TargetRegSize]);
967     } else {
968       BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg)
969           .addReg(Reg);
970     }
971     return NewReg;
972   };
973 
974   unsigned &CondReg = CondRegs[X86::COND_B];
975   if (!CondReg)
976     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B);
977 
978   // Adjust the condition to have the desired register width by zero-extending
979   // as needed.
980   // FIXME: We should use a better API to avoid the local reference and using a
981   // different variable here.
982   unsigned ExtCondReg = AdjustReg(CondReg);
983 
984   // Now we need to turn this into a bitmask. We do this by subtracting it from
985   // zero.
986   unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
987   BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
988   ZeroReg = AdjustReg(ZeroReg);
989 
990   unsigned Sub;
991   switch (SetBI.getOpcode()) {
992   case X86::SETB_C8r:
993     Sub = X86::SUB8rr;
994     break;
995 
996   case X86::SETB_C16r:
997     Sub = X86::SUB16rr;
998     break;
999 
1000   case X86::SETB_C32r:
1001     Sub = X86::SUB32rr;
1002     break;
1003 
1004   case X86::SETB_C64r:
1005     Sub = X86::SUB64rr;
1006     break;
1007 
1008   default:
1009     llvm_unreachable("Invalid SETB_C* opcode!");
1010   }
1011   unsigned ResultReg = MRI->createVirtualRegister(&SetBRC);
1012   BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
1013       .addReg(ZeroReg)
1014       .addReg(ExtCondReg);
1015   return RewriteToReg(ResultReg);
1016 }
1017 
1018 void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
1019                                             MachineBasicBlock::iterator TestPos,
1020                                             DebugLoc TestLoc,
1021                                             MachineInstr &SetCCI,
1022                                             MachineOperand &FlagUse,
1023                                             CondRegArray &CondRegs) {
1024   X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode());
1025   // Note that we can't usefully rewrite this to the inverse without complex
1026   // analysis of the users of the setCC. Largely we rely on duplicates which
1027   // could have been avoided already being avoided here.
1028   unsigned &CondReg = CondRegs[Cond];
1029   if (!CondReg)
1030     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
1031 
1032   // Rewriting a register def is trivial: we just replace the register and
1033   // remove the setcc.
1034   if (!SetCCI.mayStore()) {
1035     assert(SetCCI.getOperand(0).isReg() &&
1036            "Cannot have a non-register defined operand to SETcc!");
1037     MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg);
1038     SetCCI.eraseFromParent();
1039     return;
1040   }
1041 
1042   // Otherwise, we need to emit a store.
1043   auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(),
1044                      SetCCI.getDebugLoc(), TII->get(X86::MOV8mr));
1045   // Copy the address operands.
1046   for (int i = 0; i < X86::AddrNumOperands; ++i)
1047     MIB.add(SetCCI.getOperand(i));
1048 
1049   MIB.addReg(CondReg);
1050 
1051   MIB->setMemRefs(SetCCI.memoperands_begin(), SetCCI.memoperands_end());
1052 
1053   SetCCI.eraseFromParent();
1054   return;
1055 }
1056