1 //====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 ///
10 /// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual
11 /// flag bits.
12 ///
13 /// We have to do this by carefully analyzing and rewriting the usage of the
14 /// copied EFLAGS register because there is no general way to rematerialize the
15 /// entire EFLAGS register safely and efficiently. Using `popf` both forces
16 /// dynamic stack adjustment and can create correctness issues due to IF, TF,
17 /// and other non-status flags being overwritten. Using sequences involving
18 /// SAHF don't work on all x86 processors and are often quite slow compared to
19 /// directly testing a single status preserved in its own GPR.
20 ///
21 //===----------------------------------------------------------------------===//
22 
23 #include "X86.h"
24 #include "X86InstrBuilder.h"
25 #include "X86InstrInfo.h"
26 #include "X86Subtarget.h"
27 #include "llvm/ADT/ArrayRef.h"
28 #include "llvm/ADT/DenseMap.h"
29 #include "llvm/ADT/PostOrderIterator.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/ScopeExit.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallSet.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/SparseBitVector.h"
36 #include "llvm/ADT/Statistic.h"
37 #include "llvm/CodeGen/MachineBasicBlock.h"
38 #include "llvm/CodeGen/MachineConstantPool.h"
39 #include "llvm/CodeGen/MachineDominators.h"
40 #include "llvm/CodeGen/MachineFunction.h"
41 #include "llvm/CodeGen/MachineFunctionPass.h"
42 #include "llvm/CodeGen/MachineInstr.h"
43 #include "llvm/CodeGen/MachineInstrBuilder.h"
44 #include "llvm/CodeGen/MachineModuleInfo.h"
45 #include "llvm/CodeGen/MachineOperand.h"
46 #include "llvm/CodeGen/MachineRegisterInfo.h"
47 #include "llvm/CodeGen/MachineSSAUpdater.h"
48 #include "llvm/CodeGen/TargetInstrInfo.h"
49 #include "llvm/CodeGen/TargetRegisterInfo.h"
50 #include "llvm/CodeGen/TargetSchedule.h"
51 #include "llvm/CodeGen/TargetSubtargetInfo.h"
52 #include "llvm/IR/DebugLoc.h"
53 #include "llvm/MC/MCSchedule.h"
54 #include "llvm/Pass.h"
55 #include "llvm/Support/CommandLine.h"
56 #include "llvm/Support/Debug.h"
57 #include "llvm/Support/raw_ostream.h"
58 #include <algorithm>
59 #include <cassert>
60 #include <iterator>
61 #include <utility>
62 
63 using namespace llvm;
64 
65 #define PASS_KEY "x86-flags-copy-lowering"
66 #define DEBUG_TYPE PASS_KEY
67 
68 STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
69 STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
70 STATISTIC(NumTestsInserted, "Number of test instructions inserted");
71 STATISTIC(NumAddsInserted, "Number of adds instructions inserted");
72 
73 namespace llvm {
74 
75 void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
76 
77 } // end namespace llvm
78 
79 namespace {
80 
81 // Convenient array type for storing registers associated with each condition.
82 using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;
83 
84 class X86FlagsCopyLoweringPass : public MachineFunctionPass {
85 public:
86   X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {
87     initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry());
88   }
89 
90   StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
91   bool runOnMachineFunction(MachineFunction &MF) override;
92   void getAnalysisUsage(AnalysisUsage &AU) const override;
93 
94   /// Pass identification, replacement for typeid.
95   static char ID;
96 
97 private:
98   MachineRegisterInfo *MRI;
99   const X86Subtarget *Subtarget;
100   const X86InstrInfo *TII;
101   const TargetRegisterInfo *TRI;
102   const TargetRegisterClass *PromoteRC;
103   MachineDominatorTree *MDT;
104 
105   CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
106                                   MachineBasicBlock::iterator CopyDefI);
107 
108   unsigned promoteCondToReg(MachineBasicBlock &MBB,
109                             MachineBasicBlock::iterator TestPos,
110                             DebugLoc TestLoc, X86::CondCode Cond);
111   std::pair<unsigned, bool>
112   getCondOrInverseInReg(MachineBasicBlock &TestMBB,
113                         MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
114                         X86::CondCode Cond, CondRegArray &CondRegs);
115   void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
116                   DebugLoc Loc, unsigned Reg);
117 
118   void rewriteArithmetic(MachineBasicBlock &TestMBB,
119                          MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
120                          MachineInstr &MI, MachineOperand &FlagUse,
121                          CondRegArray &CondRegs);
122   void rewriteCMov(MachineBasicBlock &TestMBB,
123                    MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
124                    MachineInstr &CMovI, MachineOperand &FlagUse,
125                    CondRegArray &CondRegs);
126   void rewriteCondJmp(MachineBasicBlock &TestMBB,
127                       MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
128                       MachineInstr &JmpI, CondRegArray &CondRegs);
129   void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
130                    MachineInstr &CopyDefI);
131   void rewriteSetCarryExtended(MachineBasicBlock &TestMBB,
132                                MachineBasicBlock::iterator TestPos,
133                                DebugLoc TestLoc, MachineInstr &SetBI,
134                                MachineOperand &FlagUse, CondRegArray &CondRegs);
135   void rewriteSetCC(MachineBasicBlock &TestMBB,
136                     MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
137                     MachineInstr &SetCCI, MachineOperand &FlagUse,
138                     CondRegArray &CondRegs);
139 };
140 
141 } // end anonymous namespace
142 
143 INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE,
144                       "X86 EFLAGS copy lowering", false, false)
145 INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE,
146                     "X86 EFLAGS copy lowering", false, false)
147 
148 FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
149   return new X86FlagsCopyLoweringPass();
150 }
151 
152 char X86FlagsCopyLoweringPass::ID = 0;
153 
154 void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
155   AU.addRequired<MachineDominatorTree>();
156   MachineFunctionPass::getAnalysisUsage(AU);
157 }
158 
159 namespace {
160 /// An enumeration of the arithmetic instruction mnemonics which have
161 /// interesting flag semantics.
162 ///
163 /// We can map instruction opcodes into these mnemonics to make it easy to
164 /// dispatch with specific functionality.
165 enum class FlagArithMnemonic {
166   ADC,
167   ADCX,
168   ADOX,
169   RCL,
170   RCR,
171   SBB,
172 };
173 } // namespace
174 
175 static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
176   switch (Opcode) {
177   default:
178     report_fatal_error("No support for lowering a copy into EFLAGS when used "
179                        "by this instruction!");
180 
181 #define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX)                              \
182   case X86::MNEMONIC##8##SUFFIX:                                               \
183   case X86::MNEMONIC##16##SUFFIX:                                              \
184   case X86::MNEMONIC##32##SUFFIX:                                              \
185   case X86::MNEMONIC##64##SUFFIX:
186 
187 #define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC)                                    \
188   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr)                                        \
189   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV)                                    \
190   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm)                                        \
191   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr)                                        \
192   case X86::MNEMONIC##8ri:                                                     \
193   case X86::MNEMONIC##16ri8:                                                   \
194   case X86::MNEMONIC##32ri8:                                                   \
195   case X86::MNEMONIC##64ri8:                                                   \
196   case X86::MNEMONIC##16ri:                                                    \
197   case X86::MNEMONIC##32ri:                                                    \
198   case X86::MNEMONIC##64ri32:                                                  \
199   case X86::MNEMONIC##8mi:                                                     \
200   case X86::MNEMONIC##16mi8:                                                   \
201   case X86::MNEMONIC##32mi8:                                                   \
202   case X86::MNEMONIC##64mi8:                                                   \
203   case X86::MNEMONIC##16mi:                                                    \
204   case X86::MNEMONIC##32mi:                                                    \
205   case X86::MNEMONIC##64mi32:                                                  \
206   case X86::MNEMONIC##8i8:                                                     \
207   case X86::MNEMONIC##16i16:                                                   \
208   case X86::MNEMONIC##32i32:                                                   \
209   case X86::MNEMONIC##64i32:
210 
211     LLVM_EXPAND_ADC_SBB_INSTR(ADC)
212     return FlagArithMnemonic::ADC;
213 
214     LLVM_EXPAND_ADC_SBB_INSTR(SBB)
215     return FlagArithMnemonic::SBB;
216 
217 #undef LLVM_EXPAND_ADC_SBB_INSTR
218 
219     LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
220     LLVM_EXPAND_INSTR_SIZES(RCL, r1)
221     LLVM_EXPAND_INSTR_SIZES(RCL, ri)
222     return FlagArithMnemonic::RCL;
223 
224     LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
225     LLVM_EXPAND_INSTR_SIZES(RCR, r1)
226     LLVM_EXPAND_INSTR_SIZES(RCR, ri)
227     return FlagArithMnemonic::RCR;
228 
229 #undef LLVM_EXPAND_INSTR_SIZES
230 
231   case X86::ADCX32rr:
232   case X86::ADCX64rr:
233   case X86::ADCX32rm:
234   case X86::ADCX64rm:
235     return FlagArithMnemonic::ADCX;
236 
237   case X86::ADOX32rr:
238   case X86::ADOX64rr:
239   case X86::ADOX32rm:
240   case X86::ADOX64rm:
241     return FlagArithMnemonic::ADOX;
242   }
243 }
244 
245 static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
246                                      MachineInstr &SplitI,
247                                      const X86InstrInfo &TII) {
248   MachineFunction &MF = *MBB.getParent();
249 
250   assert(SplitI.getParent() == &MBB &&
251          "Split instruction must be in the split block!");
252   assert(SplitI.isBranch() &&
253          "Only designed to split a tail of branch instructions!");
254   assert(X86::getCondFromBranch(SplitI) != X86::COND_INVALID &&
255          "Must split on an actual jCC instruction!");
256 
257   // Dig out the previous instruction to the split point.
258   MachineInstr &PrevI = *std::prev(SplitI.getIterator());
259   assert(PrevI.isBranch() && "Must split after a branch!");
260   assert(X86::getCondFromBranch(PrevI) != X86::COND_INVALID &&
261          "Must split after an actual jCC instruction!");
262   assert(!std::prev(PrevI.getIterator())->isTerminator() &&
263          "Must only have this one terminator prior to the split!");
264 
265   // Grab the one successor edge that will stay in `MBB`.
266   MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB();
267 
268   // Analyze the original block to see if we are actually splitting an edge
269   // into two edges. This can happen when we have multiple conditional jumps to
270   // the same successor.
271   bool IsEdgeSplit =
272       std::any_of(SplitI.getIterator(), MBB.instr_end(),
273                   [&](MachineInstr &MI) {
274                     assert(MI.isTerminator() &&
275                            "Should only have spliced terminators!");
276                     return llvm::any_of(
277                         MI.operands(), [&](MachineOperand &MOp) {
278                           return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc;
279                         });
280                   }) ||
281       MBB.getFallThrough() == &UnsplitSucc;
282 
283   MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
284 
285   // Insert the new block immediately after the current one. Any existing
286   // fallthrough will be sunk into this new block anyways.
287   MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
288 
289   // Splice the tail of instructions into the new block.
290   NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end());
291 
292   // Copy the necessary succesors (and their probability info) into the new
293   // block.
294   for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
295     if (IsEdgeSplit || *SI != &UnsplitSucc)
296       NewMBB.copySuccessor(&MBB, SI);
297   // Normalize the probabilities if we didn't end up splitting the edge.
298   if (!IsEdgeSplit)
299     NewMBB.normalizeSuccProbs();
300 
301   // Now replace all of the moved successors in the original block with the new
302   // block. This will merge their probabilities.
303   for (MachineBasicBlock *Succ : NewMBB.successors())
304     if (Succ != &UnsplitSucc)
305       MBB.replaceSuccessor(Succ, &NewMBB);
306 
307   // We should always end up replacing at least one successor.
308   assert(MBB.isSuccessor(&NewMBB) &&
309          "Failed to make the new block a successor!");
310 
311   // Now update all the PHIs.
312   for (MachineBasicBlock *Succ : NewMBB.successors()) {
313     for (MachineInstr &MI : *Succ) {
314       if (!MI.isPHI())
315         break;
316 
317       for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
318            OpIdx += 2) {
319         MachineOperand &OpV = MI.getOperand(OpIdx);
320         MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
321         assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
322         if (OpMBB.getMBB() != &MBB)
323           continue;
324 
325         // Replace the operand for unsplit successors
326         if (!IsEdgeSplit || Succ != &UnsplitSucc) {
327           OpMBB.setMBB(&NewMBB);
328 
329           // We have to continue scanning as there may be multiple entries in
330           // the PHI.
331           continue;
332         }
333 
334         // When we have split the edge append a new successor.
335         MI.addOperand(MF, OpV);
336         MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
337         break;
338       }
339     }
340   }
341 
342   return NewMBB;
343 }
344 
345 bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
346   LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
347                     << " **********\n");
348 
349   Subtarget = &MF.getSubtarget<X86Subtarget>();
350   MRI = &MF.getRegInfo();
351   TII = Subtarget->getInstrInfo();
352   TRI = Subtarget->getRegisterInfo();
353   MDT = &getAnalysis<MachineDominatorTree>();
354   PromoteRC = &X86::GR8RegClass;
355 
356   if (MF.begin() == MF.end())
357     // Nothing to do for a degenerate empty function...
358     return false;
359 
360   // Collect the copies in RPO so that when there are chains where a copy is in
361   // turn copied again we visit the first one first. This ensures we can find
362   // viable locations for testing the original EFLAGS that dominate all the
363   // uses across complex CFGs.
364   SmallVector<MachineInstr *, 4> Copies;
365   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
366   for (MachineBasicBlock *MBB : RPOT)
367     for (MachineInstr &MI : *MBB)
368       if (MI.getOpcode() == TargetOpcode::COPY &&
369           MI.getOperand(0).getReg() == X86::EFLAGS)
370         Copies.push_back(&MI);
371 
372   for (MachineInstr *CopyI : Copies) {
373     MachineBasicBlock &MBB = *CopyI->getParent();
374 
375     MachineOperand &VOp = CopyI->getOperand(1);
376     assert(VOp.isReg() &&
377            "The input to the copy for EFLAGS should always be a register!");
378     MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg());
379     if (CopyDefI.getOpcode() != TargetOpcode::COPY) {
380       // FIXME: The big likely candidate here are PHI nodes. We could in theory
381       // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard
382       // enough that it is probably better to change every other part of LLVM
383       // to avoid creating them. The issue is that once we have PHIs we won't
384       // know which original EFLAGS value we need to capture with our setCCs
385       // below. The end result will be computing a complete set of setCCs that
386       // we *might* want, computing them in every place where we copy *out* of
387       // EFLAGS and then doing SSA formation on all of them to insert necessary
388       // PHI nodes and consume those here. Then hoping that somehow we DCE the
389       // unnecessary ones. This DCE seems very unlikely to be successful and so
390       // we will almost certainly end up with a glut of dead setCC
391       // instructions. Until we have a motivating test case and fail to avoid
392       // it by changing other parts of LLVM's lowering, we refuse to handle
393       // this complex case here.
394       LLVM_DEBUG(
395           dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
396           CopyDefI.dump());
397       report_fatal_error(
398           "Cannot lower EFLAGS copy unless it is defined in turn by a copy!");
399     }
400 
401     auto Cleanup = make_scope_exit([&] {
402       // All uses of the EFLAGS copy are now rewritten, kill the copy into
403       // eflags and if dead the copy from.
404       CopyI->eraseFromParent();
405       if (MRI->use_empty(CopyDefI.getOperand(0).getReg()))
406         CopyDefI.eraseFromParent();
407       ++NumCopiesEliminated;
408     });
409 
410     MachineOperand &DOp = CopyI->getOperand(0);
411     assert(DOp.isDef() && "Expected register def!");
412     assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!");
413     if (DOp.isDead())
414       continue;
415 
416     MachineBasicBlock *TestMBB = CopyDefI.getParent();
417     auto TestPos = CopyDefI.getIterator();
418     DebugLoc TestLoc = CopyDefI.getDebugLoc();
419 
420     LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
421 
422     // Walk up across live-in EFLAGS to find where they were actually def'ed.
423     //
424     // This copy's def may just be part of a region of blocks covered by
425     // a single def of EFLAGS and we want to find the top of that region where
426     // possible.
427     //
428     // This is essentially a search for a *candidate* reaching definition
429     // location. We don't need to ever find the actual reaching definition here,
430     // but we want to walk up the dominator tree to find the highest point which
431     // would be viable for such a definition.
432     auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
433                                 MachineBasicBlock::iterator End) {
434       // Scan backwards as we expect these to be relatively short and often find
435       // a clobber near the end.
436       return llvm::any_of(
437           llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
438             // Flag any instruction (other than the copy we are
439             // currently rewriting) that defs EFLAGS.
440             return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
441           });
442     };
443     auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
444                                     MachineBasicBlock *EndMBB) {
445       assert(MDT->dominates(BeginMBB, EndMBB) &&
446              "Only support paths down the dominator tree!");
447       SmallPtrSet<MachineBasicBlock *, 4> Visited;
448       SmallVector<MachineBasicBlock *, 4> Worklist;
449       // We terminate at the beginning. No need to scan it.
450       Visited.insert(BeginMBB);
451       Worklist.push_back(EndMBB);
452       do {
453         auto *MBB = Worklist.pop_back_val();
454         for (auto *PredMBB : MBB->predecessors()) {
455           if (!Visited.insert(PredMBB).second)
456             continue;
457           if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
458             return true;
459           // Enqueue this block to walk its predecessors.
460           Worklist.push_back(PredMBB);
461         }
462       } while (!Worklist.empty());
463       // No clobber found along a path from the begin to end.
464       return false;
465     };
466     while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
467            !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
468       // Find the nearest common dominator of the predecessors, as
469       // that will be the best candidate to hoist into.
470       MachineBasicBlock *HoistMBB =
471           std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(),
472                           *TestMBB->pred_begin(),
473                           [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
474                             return MDT->findNearestCommonDominator(LHS, RHS);
475                           });
476 
477       // Now we need to scan all predecessors that may be reached along paths to
478       // the hoist block. A clobber anywhere in any of these blocks the hoist.
479       // Note that this even handles loops because we require *no* clobbers.
480       if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
481         break;
482 
483       // We also need the terminators to not sneakily clobber flags.
484       if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
485                            HoistMBB->instr_end()))
486         break;
487 
488       // We found a viable location, hoist our test position to it.
489       TestMBB = HoistMBB;
490       TestPos = TestMBB->getFirstTerminator()->getIterator();
491       // Clear the debug location as it would just be confusing after hoisting.
492       TestLoc = DebugLoc();
493     }
494     LLVM_DEBUG({
495       auto DefIt = llvm::find_if(
496           llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
497           [&](MachineInstr &MI) {
498             return MI.findRegisterDefOperand(X86::EFLAGS);
499           });
500       if (DefIt.base() != TestMBB->instr_begin()) {
501         dbgs() << "  Using EFLAGS defined by: ";
502         DefIt->dump();
503       } else {
504         dbgs() << "  Using live-in flags for BB:\n";
505         TestMBB->dump();
506       }
507     });
508 
509     // While rewriting uses, we buffer jumps and rewrite them in a second pass
510     // because doing so will perturb the CFG that we are walking to find the
511     // uses in the first place.
512     SmallVector<MachineInstr *, 4> JmpIs;
513 
514     // Gather the condition flags that have already been preserved in
515     // registers. We do this from scratch each time as we expect there to be
516     // very few of them and we expect to not revisit the same copy definition
517     // many times. If either of those change sufficiently we could build a map
518     // of these up front instead.
519     CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);
520 
521     // Collect the basic blocks we need to scan. Typically this will just be
522     // a single basic block but we may have to scan multiple blocks if the
523     // EFLAGS copy lives into successors.
524     SmallVector<MachineBasicBlock *, 2> Blocks;
525     SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
526     Blocks.push_back(&MBB);
527 
528     do {
529       MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
530 
531       // Track when if/when we find a kill of the flags in this block.
532       bool FlagsKilled = false;
533 
534       // In most cases, we walk from the beginning to the end of the block. But
535       // when the block is the same block as the copy is from, we will visit it
536       // twice. The first time we start from the copy and go to the end. The
537       // second time we start from the beginning and go to the copy. This lets
538       // us handle copies inside of cycles.
539       // FIXME: This loop is *super* confusing. This is at least in part
540       // a symptom of all of this routine needing to be refactored into
541       // documentable components. Once done, there may be a better way to write
542       // this loop.
543       for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
544                           ? std::next(CopyI->getIterator())
545                           : UseMBB.instr_begin(),
546                 MIE = UseMBB.instr_end();
547            MII != MIE;) {
548         MachineInstr &MI = *MII++;
549         // If we are in the original copy block and encounter either the copy
550         // def or the copy itself, break so that we don't re-process any part of
551         // the block or process the instructions in the range that was copied
552         // over.
553         if (&MI == CopyI || &MI == &CopyDefI) {
554           assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
555                  "Should only encounter these on the second pass over the "
556                  "original block.");
557           break;
558         }
559 
560         MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
561         if (!FlagUse) {
562           if (MI.findRegisterDefOperand(X86::EFLAGS)) {
563             // If EFLAGS are defined, it's as-if they were killed. We can stop
564             // scanning here.
565             //
566             // NB!!! Many instructions only modify some flags. LLVM currently
567             // models this as clobbering all flags, but if that ever changes
568             // this will need to be carefully updated to handle that more
569             // complex logic.
570             FlagsKilled = true;
571             break;
572           }
573           continue;
574         }
575 
576         LLVM_DEBUG(dbgs() << "  Rewriting use: "; MI.dump());
577 
578         // Check the kill flag before we rewrite as that may change it.
579         if (FlagUse->isKill())
580           FlagsKilled = true;
581 
582         // Once we encounter a branch, the rest of the instructions must also be
583         // branches. We can't rewrite in place here, so we handle them below.
584         //
585         // Note that we don't have to handle tail calls here, even conditional
586         // tail calls, as those are not introduced into the X86 MI until post-RA
587         // branch folding or black placement. As a consequence, we get to deal
588         // with the simpler formulation of conditional branches followed by tail
589         // calls.
590         if (X86::getCondFromBranch(MI) != X86::COND_INVALID) {
591           auto JmpIt = MI.getIterator();
592           do {
593             JmpIs.push_back(&*JmpIt);
594             ++JmpIt;
595           } while (JmpIt != UseMBB.instr_end() &&
596                    X86::getCondFromBranch(*JmpIt) !=
597                        X86::COND_INVALID);
598           break;
599         }
600 
601         // Otherwise we can just rewrite in-place.
602         if (X86::getCondFromCMov(MI) != X86::COND_INVALID) {
603           rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
604         } else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) {
605           rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
606         } else if (MI.getOpcode() == TargetOpcode::COPY) {
607           rewriteCopy(MI, *FlagUse, CopyDefI);
608         } else {
609           // We assume all other instructions that use flags also def them.
610           assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
611                  "Expected a def of EFLAGS for this instruction!");
612 
613           // NB!!! Several arithmetic instructions only *partially* update
614           // flags. Theoretically, we could generate MI code sequences that
615           // would rely on this fact and observe different flags independently.
616           // But currently LLVM models all of these instructions as clobbering
617           // all the flags in an undef way. We rely on that to simplify the
618           // logic.
619           FlagsKilled = true;
620 
621           switch (MI.getOpcode()) {
622           case X86::SETB_C8r:
623           case X86::SETB_C16r:
624           case X86::SETB_C32r:
625           case X86::SETB_C64r:
626             // Use custom lowering for arithmetic that is merely extending the
627             // carry flag. We model this as the SETB_C* pseudo instructions.
628             rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
629                                     CondRegs);
630             break;
631 
632           default:
633             // Generically handle remaining uses as arithmetic instructions.
634             rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
635                               CondRegs);
636             break;
637           }
638           break;
639         }
640 
641         // If this was the last use of the flags, we're done.
642         if (FlagsKilled)
643           break;
644       }
645 
646       // If the flags were killed, we're done with this block.
647       if (FlagsKilled)
648         continue;
649 
650       // Otherwise we need to scan successors for ones where the flags live-in
651       // and queue those up for processing.
652       for (MachineBasicBlock *SuccMBB : UseMBB.successors())
653         if (SuccMBB->isLiveIn(X86::EFLAGS) &&
654             VisitedBlocks.insert(SuccMBB).second) {
655           // We currently don't do any PHI insertion and so we require that the
656           // test basic block dominates all of the use basic blocks. Further, we
657           // can't have a cycle from the test block back to itself as that would
658           // create a cycle requiring a PHI to break it.
659           //
660           // We could in theory do PHI insertion here if it becomes useful by
661           // just taking undef values in along every edge that we don't trace
662           // this EFLAGS copy along. This isn't as bad as fully general PHI
663           // insertion, but still seems like a great deal of complexity.
664           //
665           // Because it is theoretically possible that some earlier MI pass or
666           // other lowering transformation could induce this to happen, we do
667           // a hard check even in non-debug builds here.
668           if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
669             LLVM_DEBUG({
670               dbgs()
671                   << "ERROR: Encountered use that is not dominated by our test "
672                      "basic block! Rewriting this would require inserting PHI "
673                      "nodes to track the flag state across the CFG.\n\nTest "
674                      "block:\n";
675               TestMBB->dump();
676               dbgs() << "Use block:\n";
677               SuccMBB->dump();
678             });
679             report_fatal_error(
680                 "Cannot lower EFLAGS copy when original copy def "
681                 "does not dominate all uses.");
682           }
683 
684           Blocks.push_back(SuccMBB);
685         }
686     } while (!Blocks.empty());
687 
688     // Now rewrite the jumps that use the flags. These we handle specially
689     // because if there are multiple jumps in a single basic block we'll have
690     // to do surgery on the CFG.
691     MachineBasicBlock *LastJmpMBB = nullptr;
692     for (MachineInstr *JmpI : JmpIs) {
693       // Past the first jump within a basic block we need to split the blocks
694       // apart.
695       if (JmpI->getParent() == LastJmpMBB)
696         splitBlock(*JmpI->getParent(), *JmpI, *TII);
697       else
698         LastJmpMBB = JmpI->getParent();
699 
700       rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
701     }
702 
703     // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
704     // the copy's def operand is itself a kill.
705   }
706 
707 #ifndef NDEBUG
708   for (MachineBasicBlock &MBB : MF)
709     for (MachineInstr &MI : MBB)
710       if (MI.getOpcode() == TargetOpcode::COPY &&
711           (MI.getOperand(0).getReg() == X86::EFLAGS ||
712            MI.getOperand(1).getReg() == X86::EFLAGS)) {
713         LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: ";
714                    MI.dump());
715         llvm_unreachable("Unlowered EFLAGS copy!");
716       }
717 #endif
718 
719   return true;
720 }
721 
722 /// Collect any conditions that have already been set in registers so that we
723 /// can re-use them rather than adding duplicates.
724 CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
725     MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
726   CondRegArray CondRegs = {};
727 
728   // Scan backwards across the range of instructions with live EFLAGS.
729   for (MachineInstr &MI :
730        llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
731     X86::CondCode Cond = X86::getCondFromSETCC(MI);
732     if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() &&
733         TRI->isVirtualRegister(MI.getOperand(0).getReg())) {
734       assert(MI.getOperand(0).isDef() &&
735              "A non-storing SETcc should always define a register!");
736       CondRegs[Cond] = MI.getOperand(0).getReg();
737     }
738 
739     // Stop scanning when we see the first definition of the EFLAGS as prior to
740     // this we would potentially capture the wrong flag state.
741     if (MI.findRegisterDefOperand(X86::EFLAGS))
742       break;
743   }
744   return CondRegs;
745 }
746 
747 unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
748     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
749     DebugLoc TestLoc, X86::CondCode Cond) {
750   unsigned Reg = MRI->createVirtualRegister(PromoteRC);
751   auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
752                       TII->get(X86::SETCCr), Reg).addImm(Cond);
753   (void)SetI;
754   LLVM_DEBUG(dbgs() << "    save cond: "; SetI->dump());
755   ++NumSetCCsInserted;
756   return Reg;
757 }
758 
759 std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
760     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
761     DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
762   unsigned &CondReg = CondRegs[Cond];
763   unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)];
764   if (!CondReg && !InvCondReg)
765     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
766 
767   if (CondReg)
768     return {CondReg, false};
769   else
770     return {InvCondReg, true};
771 }
772 
773 void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
774                                           MachineBasicBlock::iterator Pos,
775                                           DebugLoc Loc, unsigned Reg) {
776   auto TestI =
777       BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg);
778   (void)TestI;
779   LLVM_DEBUG(dbgs() << "    test cond: "; TestI->dump());
780   ++NumTestsInserted;
781 }
782 
783 void X86FlagsCopyLoweringPass::rewriteArithmetic(
784     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
785     DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
786     CondRegArray &CondRegs) {
787   // Arithmetic is either reading CF or OF. Figure out which condition we need
788   // to preserve in a register.
789   X86::CondCode Cond;
790 
791   // The addend to use to reset CF or OF when added to the flag value.
792   int Addend;
793 
794   switch (getMnemonicFromOpcode(MI.getOpcode())) {
795   case FlagArithMnemonic::ADC:
796   case FlagArithMnemonic::ADCX:
797   case FlagArithMnemonic::RCL:
798   case FlagArithMnemonic::RCR:
799   case FlagArithMnemonic::SBB:
800     Cond = X86::COND_B; // CF == 1
801     // Set up an addend that when one is added will need a carry due to not
802     // having a higher bit available.
803     Addend = 255;
804     break;
805 
806   case FlagArithMnemonic::ADOX:
807     Cond = X86::COND_O; // OF == 1
808     // Set up an addend that when one is added will turn from positive to
809     // negative and thus overflow in the signed domain.
810     Addend = 127;
811     break;
812   }
813 
814   // Now get a register that contains the value of the flag input to the
815   // arithmetic. We require exactly this flag to simplify the arithmetic
816   // required to materialize it back into the flag.
817   unsigned &CondReg = CondRegs[Cond];
818   if (!CondReg)
819     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
820 
821   MachineBasicBlock &MBB = *MI.getParent();
822 
823   // Insert an instruction that will set the flag back to the desired value.
824   unsigned TmpReg = MRI->createVirtualRegister(PromoteRC);
825   auto AddI =
826       BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
827           .addDef(TmpReg, RegState::Dead)
828           .addReg(CondReg)
829           .addImm(Addend);
830   (void)AddI;
831   LLVM_DEBUG(dbgs() << "    add cond: "; AddI->dump());
832   ++NumAddsInserted;
833   FlagUse.setIsKill(true);
834 }
835 
836 void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
837                                            MachineBasicBlock::iterator TestPos,
838                                            DebugLoc TestLoc,
839                                            MachineInstr &CMovI,
840                                            MachineOperand &FlagUse,
841                                            CondRegArray &CondRegs) {
842   // First get the register containing this specific condition.
843   X86::CondCode Cond = X86::getCondFromCMov(CMovI);
844   unsigned CondReg;
845   bool Inverted;
846   std::tie(CondReg, Inverted) =
847       getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
848 
849   MachineBasicBlock &MBB = *CMovI.getParent();
850 
851   // Insert a direct test of the saved register.
852   insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
853 
854   // Rewrite the CMov to use the !ZF flag from the test, and then kill its use
855   // of the flags afterward.
856   CMovI.getOperand(CMovI.getDesc().getNumOperands() - 1)
857       .setImm(Inverted ? X86::COND_E : X86::COND_NE);
858   FlagUse.setIsKill(true);
859   LLVM_DEBUG(dbgs() << "    fixed cmov: "; CMovI.dump());
860 }
861 
862 void X86FlagsCopyLoweringPass::rewriteCondJmp(
863     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
864     DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
865   // First get the register containing this specific condition.
866   X86::CondCode Cond = X86::getCondFromBranch(JmpI);
867   unsigned CondReg;
868   bool Inverted;
869   std::tie(CondReg, Inverted) =
870       getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
871 
872   MachineBasicBlock &JmpMBB = *JmpI.getParent();
873 
874   // Insert a direct test of the saved register.
875   insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);
876 
877   // Rewrite the jump to use the !ZF flag from the test, and kill its use of
878   // flags afterward.
879   JmpI.getOperand(1).setImm(Inverted ? X86::COND_E : X86::COND_NE);
880   JmpI.findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
881   LLVM_DEBUG(dbgs() << "    fixed jCC: "; JmpI.dump());
882 }
883 
884 void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
885                                            MachineOperand &FlagUse,
886                                            MachineInstr &CopyDefI) {
887   // Just replace this copy with the original copy def.
888   MRI->replaceRegWith(MI.getOperand(0).getReg(),
889                       CopyDefI.getOperand(0).getReg());
890   MI.eraseFromParent();
891 }
892 
893 void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
894     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
895     DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse,
896     CondRegArray &CondRegs) {
897   // This routine is only used to handle pseudos for setting a register to zero
898   // or all ones based on CF. This is essentially the sign extended from 1-bit
899   // form of SETB and modeled with the SETB_C* pseudos. They require special
900   // handling as they aren't normal SETcc instructions and are lowered to an
901   // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that
902   // they are only provided in reg-defining forms. A complicating factor is that
903   // they can define many different register widths.
904   assert(SetBI.getOperand(0).isReg() &&
905          "Cannot have a non-register defined operand to this variant of SETB!");
906 
907   // Little helper to do the common final step of replacing the register def'ed
908   // by this SETB instruction with a new register and removing the SETB
909   // instruction.
910   auto RewriteToReg = [&](unsigned Reg) {
911     MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg);
912     SetBI.eraseFromParent();
913   };
914 
915   // Grab the register class used for this particular instruction.
916   auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg());
917 
918   MachineBasicBlock &MBB = *SetBI.getParent();
919   auto SetPos = SetBI.getIterator();
920   auto SetLoc = SetBI.getDebugLoc();
921 
922   auto AdjustReg = [&](unsigned Reg) {
923     auto &OrigRC = *MRI->getRegClass(Reg);
924     if (&OrigRC == &SetBRC)
925       return Reg;
926 
927     unsigned NewReg;
928 
929     int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8;
930     int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8;
931     assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!");
932     assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!");
933     int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit,
934                        X86::NoSubRegister, X86::sub_32bit};
935 
936     // If the original size is smaller than the target *and* is smaller than 4
937     // bytes, we need to explicitly zero extend it. We always extend to 4-bytes
938     // to maximize the chance of being able to CSE that operation and to avoid
939     // partial dependency stalls extending to 2-bytes.
940     if (OrigRegSize < TargetRegSize && OrigRegSize < 4) {
941       NewReg = MRI->createVirtualRegister(&X86::GR32RegClass);
942       BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg)
943           .addReg(Reg);
944       if (&SetBRC == &X86::GR32RegClass)
945         return NewReg;
946       Reg = NewReg;
947       OrigRegSize = 4;
948     }
949 
950     NewReg = MRI->createVirtualRegister(&SetBRC);
951     if (OrigRegSize < TargetRegSize) {
952       BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG),
953               NewReg)
954           .addImm(0)
955           .addReg(Reg)
956           .addImm(SubRegIdx[OrigRegSize]);
957     } else if (OrigRegSize > TargetRegSize) {
958       if (TargetRegSize == 1 && !Subtarget->is64Bit()) {
959         // Need to constrain the register class.
960         MRI->constrainRegClass(Reg, &X86::GR32_ABCDRegClass);
961       }
962 
963       BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY),
964               NewReg)
965           .addReg(Reg, 0, SubRegIdx[TargetRegSize]);
966     } else {
967       BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg)
968           .addReg(Reg);
969     }
970     return NewReg;
971   };
972 
973   unsigned &CondReg = CondRegs[X86::COND_B];
974   if (!CondReg)
975     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B);
976 
977   // Adjust the condition to have the desired register width by zero-extending
978   // as needed.
979   // FIXME: We should use a better API to avoid the local reference and using a
980   // different variable here.
981   unsigned ExtCondReg = AdjustReg(CondReg);
982 
983   // Now we need to turn this into a bitmask. We do this by subtracting it from
984   // zero.
985   unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
986   BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
987   ZeroReg = AdjustReg(ZeroReg);
988 
989   unsigned Sub;
990   switch (SetBI.getOpcode()) {
991   case X86::SETB_C8r:
992     Sub = X86::SUB8rr;
993     break;
994 
995   case X86::SETB_C16r:
996     Sub = X86::SUB16rr;
997     break;
998 
999   case X86::SETB_C32r:
1000     Sub = X86::SUB32rr;
1001     break;
1002 
1003   case X86::SETB_C64r:
1004     Sub = X86::SUB64rr;
1005     break;
1006 
1007   default:
1008     llvm_unreachable("Invalid SETB_C* opcode!");
1009   }
1010   unsigned ResultReg = MRI->createVirtualRegister(&SetBRC);
1011   BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
1012       .addReg(ZeroReg)
1013       .addReg(ExtCondReg);
1014   return RewriteToReg(ResultReg);
1015 }
1016 
1017 void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
1018                                             MachineBasicBlock::iterator TestPos,
1019                                             DebugLoc TestLoc,
1020                                             MachineInstr &SetCCI,
1021                                             MachineOperand &FlagUse,
1022                                             CondRegArray &CondRegs) {
1023   X86::CondCode Cond = X86::getCondFromSETCC(SetCCI);
1024   // Note that we can't usefully rewrite this to the inverse without complex
1025   // analysis of the users of the setCC. Largely we rely on duplicates which
1026   // could have been avoided already being avoided here.
1027   unsigned &CondReg = CondRegs[Cond];
1028   if (!CondReg)
1029     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
1030 
1031   // Rewriting a register def is trivial: we just replace the register and
1032   // remove the setcc.
1033   if (!SetCCI.mayStore()) {
1034     assert(SetCCI.getOperand(0).isReg() &&
1035            "Cannot have a non-register defined operand to SETcc!");
1036     MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg);
1037     SetCCI.eraseFromParent();
1038     return;
1039   }
1040 
1041   // Otherwise, we need to emit a store.
1042   auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(),
1043                      SetCCI.getDebugLoc(), TII->get(X86::MOV8mr));
1044   // Copy the address operands.
1045   for (int i = 0; i < X86::AddrNumOperands; ++i)
1046     MIB.add(SetCCI.getOperand(i));
1047 
1048   MIB.addReg(CondReg);
1049 
1050   MIB.setMemRefs(SetCCI.memoperands());
1051 
1052   SetCCI.eraseFromParent();
1053   return;
1054 }
1055