1 //====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 ///
11 /// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual
12 /// flag bits.
13 ///
14 /// We have to do this by carefully analyzing and rewriting the usage of the
15 /// copied EFLAGS register because there is no general way to rematerialize the
16 /// entire EFLAGS register safely and efficiently. Using `popf` both forces
17 /// dynamic stack adjustment and can create correctness issues due to IF, TF,
18 /// and other non-status flags being overwritten. Using sequences involving
19 /// SAHF don't work on all x86 processors and are often quite slow compared to
20 /// directly testing a single status preserved in its own GPR.
21 ///
22 //===----------------------------------------------------------------------===//
23 
24 #include "X86.h"
25 #include "X86InstrBuilder.h"
26 #include "X86InstrInfo.h"
27 #include "X86Subtarget.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/ScopeExit.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallSet.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/SparseBitVector.h"
36 #include "llvm/ADT/Statistic.h"
37 #include "llvm/CodeGen/MachineBasicBlock.h"
38 #include "llvm/CodeGen/MachineConstantPool.h"
39 #include "llvm/CodeGen/MachineDominators.h"
40 #include "llvm/CodeGen/MachineFunction.h"
41 #include "llvm/CodeGen/MachineFunctionPass.h"
42 #include "llvm/CodeGen/MachineInstr.h"
43 #include "llvm/CodeGen/MachineInstrBuilder.h"
44 #include "llvm/CodeGen/MachineModuleInfo.h"
45 #include "llvm/CodeGen/MachineOperand.h"
46 #include "llvm/CodeGen/MachineRegisterInfo.h"
47 #include "llvm/CodeGen/MachineSSAUpdater.h"
48 #include "llvm/CodeGen/TargetInstrInfo.h"
49 #include "llvm/CodeGen/TargetRegisterInfo.h"
50 #include "llvm/CodeGen/TargetSchedule.h"
51 #include "llvm/CodeGen/TargetSubtargetInfo.h"
52 #include "llvm/IR/DebugLoc.h"
53 #include "llvm/MC/MCSchedule.h"
54 #include "llvm/Pass.h"
55 #include "llvm/Support/CommandLine.h"
56 #include "llvm/Support/Debug.h"
57 #include "llvm/Support/raw_ostream.h"
58 #include <algorithm>
59 #include <cassert>
60 #include <iterator>
61 #include <utility>
62 
63 using namespace llvm;
64 
65 #define PASS_KEY "x86-flags-copy-lowering"
66 #define DEBUG_TYPE PASS_KEY
67 
68 STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
69 STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
70 STATISTIC(NumTestsInserted, "Number of test instructions inserted");
71 STATISTIC(NumAddsInserted, "Number of adds instructions inserted");
72 
73 namespace llvm {
74 
75 void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
76 
77 } // end namespace llvm
78 
79 namespace {
80 
81 // Convenient array type for storing registers associated with each condition.
82 using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;
83 
84 class X86FlagsCopyLoweringPass : public MachineFunctionPass {
85 public:
86   X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {
87     initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry());
88   }
89 
90   StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
91   bool runOnMachineFunction(MachineFunction &MF) override;
92   void getAnalysisUsage(AnalysisUsage &AU) const override;
93 
94   /// Pass identification, replacement for typeid.
95   static char ID;
96 
97 private:
98   MachineRegisterInfo *MRI;
99   const X86InstrInfo *TII;
100   const TargetRegisterInfo *TRI;
101   const TargetRegisterClass *PromoteRC;
102   MachineDominatorTree *MDT;
103 
104   CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
105                                   MachineInstr &CopyDefI);
106 
107   unsigned promoteCondToReg(MachineBasicBlock &MBB,
108                             MachineBasicBlock::iterator TestPos,
109                             DebugLoc TestLoc, X86::CondCode Cond);
110   std::pair<unsigned, bool>
111   getCondOrInverseInReg(MachineBasicBlock &TestMBB,
112                         MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
113                         X86::CondCode Cond, CondRegArray &CondRegs);
114   void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
115                   DebugLoc Loc, unsigned Reg);
116 
117   void rewriteArithmetic(MachineBasicBlock &TestMBB,
118                          MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
119                          MachineInstr &MI, MachineOperand &FlagUse,
120                          CondRegArray &CondRegs);
121   void rewriteCMov(MachineBasicBlock &TestMBB,
122                    MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
123                    MachineInstr &CMovI, MachineOperand &FlagUse,
124                    CondRegArray &CondRegs);
125   void rewriteCondJmp(MachineBasicBlock &TestMBB,
126                       MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
127                       MachineInstr &JmpI, CondRegArray &CondRegs);
128   void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
129                    MachineInstr &CopyDefI);
130   void rewriteSetCarryExtended(MachineBasicBlock &TestMBB,
131                                MachineBasicBlock::iterator TestPos,
132                                DebugLoc TestLoc, MachineInstr &SetBI,
133                                MachineOperand &FlagUse, CondRegArray &CondRegs);
134   void rewriteSetCC(MachineBasicBlock &TestMBB,
135                     MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
136                     MachineInstr &SetCCI, MachineOperand &FlagUse,
137                     CondRegArray &CondRegs);
138 };
139 
140 } // end anonymous namespace
141 
142 INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE,
143                       "X86 EFLAGS copy lowering", false, false)
144 INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE,
145                     "X86 EFLAGS copy lowering", false, false)
146 
147 FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
148   return new X86FlagsCopyLoweringPass();
149 }
150 
151 char X86FlagsCopyLoweringPass::ID = 0;
152 
153 void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
154   AU.addRequired<MachineDominatorTree>();
155   MachineFunctionPass::getAnalysisUsage(AU);
156 }
157 
158 namespace {
159 /// An enumeration of the arithmetic instruction mnemonics which have
160 /// interesting flag semantics.
161 ///
162 /// We can map instruction opcodes into these mnemonics to make it easy to
163 /// dispatch with specific functionality.
164 enum class FlagArithMnemonic {
165   ADC,
166   ADCX,
167   ADOX,
168   RCL,
169   RCR,
170   SBB,
171 };
172 } // namespace
173 
174 static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
175   switch (Opcode) {
176   default:
177     report_fatal_error("No support for lowering a copy into EFLAGS when used "
178                        "by this instruction!");
179 
180 #define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX)                              \
181   case X86::MNEMONIC##8##SUFFIX:                                               \
182   case X86::MNEMONIC##16##SUFFIX:                                              \
183   case X86::MNEMONIC##32##SUFFIX:                                              \
184   case X86::MNEMONIC##64##SUFFIX:
185 
186 #define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC)                                    \
187   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr)                                        \
188   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV)                                    \
189   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm)                                        \
190   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr)                                        \
191   case X86::MNEMONIC##8ri:                                                     \
192   case X86::MNEMONIC##16ri8:                                                   \
193   case X86::MNEMONIC##32ri8:                                                   \
194   case X86::MNEMONIC##64ri8:                                                   \
195   case X86::MNEMONIC##16ri:                                                    \
196   case X86::MNEMONIC##32ri:                                                    \
197   case X86::MNEMONIC##64ri32:                                                  \
198   case X86::MNEMONIC##8mi:                                                     \
199   case X86::MNEMONIC##16mi8:                                                   \
200   case X86::MNEMONIC##32mi8:                                                   \
201   case X86::MNEMONIC##64mi8:                                                   \
202   case X86::MNEMONIC##16mi:                                                    \
203   case X86::MNEMONIC##32mi:                                                    \
204   case X86::MNEMONIC##64mi32:                                                  \
205   case X86::MNEMONIC##8i8:                                                     \
206   case X86::MNEMONIC##16i16:                                                   \
207   case X86::MNEMONIC##32i32:                                                   \
208   case X86::MNEMONIC##64i32:
209 
210     LLVM_EXPAND_ADC_SBB_INSTR(ADC)
211     return FlagArithMnemonic::ADC;
212 
213     LLVM_EXPAND_ADC_SBB_INSTR(SBB)
214     return FlagArithMnemonic::SBB;
215 
216 #undef LLVM_EXPAND_ADC_SBB_INSTR
217 
218     LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
219     LLVM_EXPAND_INSTR_SIZES(RCL, r1)
220     LLVM_EXPAND_INSTR_SIZES(RCL, ri)
221     return FlagArithMnemonic::RCL;
222 
223     LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
224     LLVM_EXPAND_INSTR_SIZES(RCR, r1)
225     LLVM_EXPAND_INSTR_SIZES(RCR, ri)
226     return FlagArithMnemonic::RCR;
227 
228 #undef LLVM_EXPAND_INSTR_SIZES
229 
230   case X86::ADCX32rr:
231   case X86::ADCX64rr:
232   case X86::ADCX32rm:
233   case X86::ADCX64rm:
234     return FlagArithMnemonic::ADCX;
235 
236   case X86::ADOX32rr:
237   case X86::ADOX64rr:
238   case X86::ADOX32rm:
239   case X86::ADOX64rm:
240     return FlagArithMnemonic::ADOX;
241   }
242 }
243 
244 static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
245                                      MachineInstr &SplitI,
246                                      const X86InstrInfo &TII) {
247   MachineFunction &MF = *MBB.getParent();
248 
249   assert(SplitI.getParent() == &MBB &&
250          "Split instruction must be in the split block!");
251   assert(SplitI.isBranch() &&
252          "Only designed to split a tail of branch instructions!");
253   assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID &&
254          "Must split on an actual jCC instruction!");
255 
256   // Dig out the previous instruction to the split point.
257   MachineInstr &PrevI = *std::prev(SplitI.getIterator());
258   assert(PrevI.isBranch() && "Must split after a branch!");
259   assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID &&
260          "Must split after an actual jCC instruction!");
261   assert(!std::prev(PrevI.getIterator())->isTerminator() &&
262          "Must only have this one terminator prior to the split!");
263 
264   // Grab the one successor edge that will stay in `MBB`.
265   MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB();
266 
267   // Analyze the original block to see if we are actually splitting an edge
268   // into two edges. This can happen when we have multiple conditional jumps to
269   // the same successor.
270   bool IsEdgeSplit =
271       std::any_of(SplitI.getIterator(), MBB.instr_end(),
272                   [&](MachineInstr &MI) {
273                     assert(MI.isTerminator() &&
274                            "Should only have spliced terminators!");
275                     return llvm::any_of(
276                         MI.operands(), [&](MachineOperand &MOp) {
277                           return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc;
278                         });
279                   }) ||
280       MBB.getFallThrough() == &UnsplitSucc;
281 
282   MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
283 
284   // Insert the new block immediately after the current one. Any existing
285   // fallthrough will be sunk into this new block anyways.
286   MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
287 
288   // Splice the tail of instructions into the new block.
289   NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end());
290 
291   // Copy the necessary succesors (and their probability info) into the new
292   // block.
293   for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
294     if (IsEdgeSplit || *SI != &UnsplitSucc)
295       NewMBB.copySuccessor(&MBB, SI);
296   // Normalize the probabilities if we didn't end up splitting the edge.
297   if (!IsEdgeSplit)
298     NewMBB.normalizeSuccProbs();
299 
300   // Now replace all of the moved successors in the original block with the new
301   // block. This will merge their probabilities.
302   for (MachineBasicBlock *Succ : NewMBB.successors())
303     if (Succ != &UnsplitSucc)
304       MBB.replaceSuccessor(Succ, &NewMBB);
305 
306   // We should always end up replacing at least one successor.
307   assert(MBB.isSuccessor(&NewMBB) &&
308          "Failed to make the new block a successor!");
309 
310   // Now update all the PHIs.
311   for (MachineBasicBlock *Succ : NewMBB.successors()) {
312     for (MachineInstr &MI : *Succ) {
313       if (!MI.isPHI())
314         break;
315 
316       for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
317            OpIdx += 2) {
318         MachineOperand &OpV = MI.getOperand(OpIdx);
319         MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
320         assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
321         if (OpMBB.getMBB() != &MBB)
322           continue;
323 
324         // Replace the operand for unsplit successors
325         if (!IsEdgeSplit || Succ != &UnsplitSucc) {
326           OpMBB.setMBB(&NewMBB);
327 
328           // We have to continue scanning as there may be multiple entries in
329           // the PHI.
330           continue;
331         }
332 
333         // When we have split the edge append a new successor.
334         MI.addOperand(MF, OpV);
335         MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
336         break;
337       }
338     }
339   }
340 
341   return NewMBB;
342 }
343 
344 bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
345   DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
346                << " **********\n");
347 
348   auto &Subtarget = MF.getSubtarget<X86Subtarget>();
349   MRI = &MF.getRegInfo();
350   TII = Subtarget.getInstrInfo();
351   TRI = Subtarget.getRegisterInfo();
352   MDT = &getAnalysis<MachineDominatorTree>();
353   PromoteRC = &X86::GR8RegClass;
354 
355   if (MF.begin() == MF.end())
356     // Nothing to do for a degenerate empty function...
357     return false;
358 
359   SmallVector<MachineInstr *, 4> Copies;
360   for (MachineBasicBlock &MBB : MF)
361     for (MachineInstr &MI : MBB)
362       if (MI.getOpcode() == TargetOpcode::COPY &&
363           MI.getOperand(0).getReg() == X86::EFLAGS)
364         Copies.push_back(&MI);
365 
366   for (MachineInstr *CopyI : Copies) {
367     MachineBasicBlock &MBB = *CopyI->getParent();
368 
369     MachineOperand &VOp = CopyI->getOperand(1);
370     assert(VOp.isReg() &&
371            "The input to the copy for EFLAGS should always be a register!");
372     MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg());
373     if (CopyDefI.getOpcode() != TargetOpcode::COPY) {
374       // FIXME: The big likely candidate here are PHI nodes. We could in theory
375       // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard
376       // enough that it is probably better to change every other part of LLVM
377       // to avoid creating them. The issue is that once we have PHIs we won't
378       // know which original EFLAGS value we need to capture with our setCCs
379       // below. The end result will be computing a complete set of setCCs that
380       // we *might* want, computing them in every place where we copy *out* of
381       // EFLAGS and then doing SSA formation on all of them to insert necessary
382       // PHI nodes and consume those here. Then hoping that somehow we DCE the
383       // unnecessary ones. This DCE seems very unlikely to be successful and so
384       // we will almost certainly end up with a glut of dead setCC
385       // instructions. Until we have a motivating test case and fail to avoid
386       // it by changing other parts of LLVM's lowering, we refuse to handle
387       // this complex case here.
388       DEBUG(dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
389             CopyDefI.dump());
390       report_fatal_error(
391           "Cannot lower EFLAGS copy unless it is defined in turn by a copy!");
392     }
393 
394     auto Cleanup = make_scope_exit([&] {
395       // All uses of the EFLAGS copy are now rewritten, kill the copy into
396       // eflags and if dead the copy from.
397       CopyI->eraseFromParent();
398       if (MRI->use_empty(CopyDefI.getOperand(0).getReg()))
399         CopyDefI.eraseFromParent();
400       ++NumCopiesEliminated;
401     });
402 
403     MachineOperand &DOp = CopyI->getOperand(0);
404     assert(DOp.isDef() && "Expected register def!");
405     assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!");
406     if (DOp.isDead())
407       continue;
408 
409     MachineBasicBlock &TestMBB = *CopyDefI.getParent();
410     auto TestPos = CopyDefI.getIterator();
411     DebugLoc TestLoc = CopyDefI.getDebugLoc();
412 
413     DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
414 
415     // Scan for usage of newly set EFLAGS so we can rewrite them. We just buffer
416     // jumps because their usage is very constrained.
417     bool FlagsKilled = false;
418     SmallVector<MachineInstr *, 4> JmpIs;
419 
420     // Gather the condition flags that have already been preserved in
421     // registers. We do this from scratch each time as we expect there to be
422     // very few of them and we expect to not revisit the same copy definition
423     // many times. If either of those change sufficiently we could build a map
424     // of these up front instead.
425     CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
426 
427     // Collect the basic blocks we need to scan. Typically this will just be
428     // a single basic block but we may have to scan multiple blocks if the
429     // EFLAGS copy lives into successors.
430     SmallVector<MachineBasicBlock *, 2> Blocks;
431     SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
432     Blocks.push_back(&MBB);
433     VisitedBlocks.insert(&MBB);
434 
435     do {
436       MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
437 
438       // We currently don't do any PHI insertion and so we require that the
439       // test basic block dominates all of the use basic blocks.
440       //
441       // We could in theory do PHI insertion here if it becomes useful by just
442       // taking undef values in along every edge that we don't trace this
443       // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
444       // but still seems like a great deal of complexity.
445       //
446       // Because it is theoretically possible that some earlier MI pass or
447       // other lowering transformation could induce this to happen, we do
448       // a hard check even in non-debug builds here.
449       if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
450         DEBUG({
451           dbgs() << "ERROR: Encountered use that is not dominated by our test "
452                     "basic block! Rewriting this would require inserting PHI "
453                     "nodes to track the flag state across the CFG.\n\nTest "
454                     "block:\n";
455           TestMBB.dump();
456           dbgs() << "Use block:\n";
457           UseMBB.dump();
458         });
459         report_fatal_error("Cannot lower EFLAGS copy when original copy def "
460                            "does not dominate all uses.");
461       }
462 
463       for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
464                                       : UseMBB.instr_begin(),
465                 MIE = UseMBB.instr_end();
466            MII != MIE;) {
467         MachineInstr &MI = *MII++;
468         MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
469         if (!FlagUse) {
470           if (MI.findRegisterDefOperand(X86::EFLAGS)) {
471             // If EFLAGS are defined, it's as-if they were killed. We can stop
472             // scanning here.
473             //
474             // NB!!! Many instructions only modify some flags. LLVM currently
475             // models this as clobbering all flags, but if that ever changes
476             // this will need to be carefully updated to handle that more
477             // complex logic.
478             FlagsKilled = true;
479             break;
480           }
481           continue;
482         }
483 
484         DEBUG(dbgs() << "  Rewriting use: "; MI.dump());
485 
486         // Check the kill flag before we rewrite as that may change it.
487         if (FlagUse->isKill())
488           FlagsKilled = true;
489 
490         // Once we encounter a branch, the rest of the instructions must also be
491         // branches. We can't rewrite in place here, so we handle them below.
492         //
493         // Note that we don't have to handle tail calls here, even conditional
494         // tail calls, as those are not introduced into the X86 MI until post-RA
495         // branch folding or black placement. As a consequence, we get to deal
496         // with the simpler formulation of conditional branches followed by tail
497         // calls.
498         if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
499           auto JmpIt = MI.getIterator();
500           do {
501             JmpIs.push_back(&*JmpIt);
502             ++JmpIt;
503           } while (JmpIt != UseMBB.instr_end() &&
504                    X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
505                        X86::COND_INVALID);
506           break;
507         }
508 
509         // Otherwise we can just rewrite in-place.
510         if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
511           rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
512         } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
513                    X86::COND_INVALID) {
514           rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
515         } else if (MI.getOpcode() == TargetOpcode::COPY) {
516           rewriteCopy(MI, *FlagUse, CopyDefI);
517         } else {
518           // We assume all other instructions that use flags also def them.
519           assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
520                  "Expected a def of EFLAGS for this instruction!");
521 
522           // NB!!! Several arithmetic instructions only *partially* update
523           // flags. Theoretically, we could generate MI code sequences that
524           // would rely on this fact and observe different flags independently.
525           // But currently LLVM models all of these instructions as clobbering
526           // all the flags in an undef way. We rely on that to simplify the
527           // logic.
528           FlagsKilled = true;
529 
530           switch (MI.getOpcode()) {
531           case X86::SETB_C8r:
532           case X86::SETB_C16r:
533           case X86::SETB_C32r:
534           case X86::SETB_C64r:
535             // Use custom lowering for arithmetic that is merely extending the
536             // carry flag. We model this as the SETB_C* pseudo instructions.
537             rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse,
538                                     CondRegs);
539             break;
540 
541           default:
542             // Generically handle remaining uses as arithmetic instructions.
543             rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse,
544                               CondRegs);
545             break;
546           }
547           break;
548         }
549 
550         // If this was the last use of the flags, we're done.
551         if (FlagsKilled)
552           break;
553       }
554 
555       // If the flags were killed, we're done with this block.
556       if (FlagsKilled)
557         break;
558 
559       // Otherwise we need to scan successors for ones where the flags live-in
560       // and queue those up for processing.
561       for (MachineBasicBlock *SuccMBB : UseMBB.successors())
562         if (SuccMBB->isLiveIn(X86::EFLAGS) &&
563             VisitedBlocks.insert(SuccMBB).second)
564           Blocks.push_back(SuccMBB);
565     } while (!Blocks.empty());
566 
567     // Now rewrite the jumps that use the flags. These we handle specially
568     // because if there are multiple jumps in a single basic block we'll have
569     // to do surgery on the CFG.
570     MachineBasicBlock *LastJmpMBB = nullptr;
571     for (MachineInstr *JmpI : JmpIs) {
572       // Past the first jump within a basic block we need to split the blocks
573       // apart.
574       if (JmpI->getParent() == LastJmpMBB)
575         splitBlock(*JmpI->getParent(), *JmpI, *TII);
576       else
577         LastJmpMBB = JmpI->getParent();
578 
579       rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
580     }
581 
582     // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
583     // the copy's def operand is itself a kill.
584   }
585 
586 #ifndef NDEBUG
587   for (MachineBasicBlock &MBB : MF)
588     for (MachineInstr &MI : MBB)
589       if (MI.getOpcode() == TargetOpcode::COPY &&
590           (MI.getOperand(0).getReg() == X86::EFLAGS ||
591            MI.getOperand(1).getReg() == X86::EFLAGS)) {
592         DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: "; MI.dump());
593         llvm_unreachable("Unlowered EFLAGS copy!");
594       }
595 #endif
596 
597   return true;
598 }
599 
600 /// Collect any conditions that have already been set in registers so that we
601 /// can re-use them rather than adding duplicates.
602 CondRegArray
603 X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB,
604                                              MachineInstr &CopyDefI) {
605   CondRegArray CondRegs = {};
606 
607   // Scan backwards across the range of instructions with live EFLAGS.
608   for (MachineInstr &MI : llvm::reverse(
609            llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) {
610     X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
611     if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() &&
612         TRI->isVirtualRegister(MI.getOperand(0).getReg())) {
613       assert(MI.getOperand(0).isDef() &&
614              "A non-storing SETcc should always define a register!");
615       CondRegs[Cond] = MI.getOperand(0).getReg();
616     }
617 
618     // Stop scanning when we see the first definition of the EFLAGS as prior to
619     // this we would potentially capture the wrong flag state.
620     if (MI.findRegisterDefOperand(X86::EFLAGS))
621       break;
622   }
623   return CondRegs;
624 }
625 
626 unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
627     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
628     DebugLoc TestLoc, X86::CondCode Cond) {
629   unsigned Reg = MRI->createVirtualRegister(PromoteRC);
630   auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
631                       TII->get(X86::getSETFromCond(Cond)), Reg);
632   (void)SetI;
633   DEBUG(dbgs() << "    save cond: "; SetI->dump());
634   ++NumSetCCsInserted;
635   return Reg;
636 }
637 
638 std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
639     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
640     DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
641   unsigned &CondReg = CondRegs[Cond];
642   unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)];
643   if (!CondReg && !InvCondReg)
644     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
645 
646   if (CondReg)
647     return {CondReg, false};
648   else
649     return {InvCondReg, true};
650 }
651 
652 void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
653                                           MachineBasicBlock::iterator Pos,
654                                           DebugLoc Loc, unsigned Reg) {
655   // We emit test instructions as register/immediate test against -1. This
656   // allows register allocation to fold a memory operand if needed (that will
657   // happen often due to the places this code is emitted). But hopefully will
658   // also allow us to select a shorter encoding of `testb %reg, %reg` when that
659   // would be equivalent.
660   auto TestI =
661       BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg);
662   (void)TestI;
663   DEBUG(dbgs() << "    test cond: "; TestI->dump());
664   ++NumTestsInserted;
665 }
666 
667 void X86FlagsCopyLoweringPass::rewriteArithmetic(
668     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
669     DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
670     CondRegArray &CondRegs) {
671   // Arithmetic is either reading CF or OF. Figure out which condition we need
672   // to preserve in a register.
673   X86::CondCode Cond;
674 
675   // The addend to use to reset CF or OF when added to the flag value.
676   int Addend;
677 
678   switch (getMnemonicFromOpcode(MI.getOpcode())) {
679   case FlagArithMnemonic::ADC:
680   case FlagArithMnemonic::ADCX:
681   case FlagArithMnemonic::RCL:
682   case FlagArithMnemonic::RCR:
683   case FlagArithMnemonic::SBB:
684     Cond = X86::COND_B; // CF == 1
685     // Set up an addend that when one is added will need a carry due to not
686     // having a higher bit available.
687     Addend = 255;
688     break;
689 
690   case FlagArithMnemonic::ADOX:
691     Cond = X86::COND_O; // OF == 1
692     // Set up an addend that when one is added will turn from positive to
693     // negative and thus overflow in the signed domain.
694     Addend = 127;
695     break;
696   }
697 
698   // Now get a register that contains the value of the flag input to the
699   // arithmetic. We require exactly this flag to simplify the arithmetic
700   // required to materialize it back into the flag.
701   unsigned &CondReg = CondRegs[Cond];
702   if (!CondReg)
703     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
704 
705   MachineBasicBlock &MBB = *MI.getParent();
706 
707   // Insert an instruction that will set the flag back to the desired value.
708   unsigned TmpReg = MRI->createVirtualRegister(PromoteRC);
709   auto AddI =
710       BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
711           .addDef(TmpReg, RegState::Dead)
712           .addReg(CondReg)
713           .addImm(Addend);
714   (void)AddI;
715   DEBUG(dbgs() << "    add cond: "; AddI->dump());
716   ++NumAddsInserted;
717   FlagUse.setIsKill(true);
718 }
719 
720 void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
721                                            MachineBasicBlock::iterator TestPos,
722                                            DebugLoc TestLoc,
723                                            MachineInstr &CMovI,
724                                            MachineOperand &FlagUse,
725                                            CondRegArray &CondRegs) {
726   // First get the register containing this specific condition.
727   X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode());
728   unsigned CondReg;
729   bool Inverted;
730   std::tie(CondReg, Inverted) =
731       getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
732 
733   MachineBasicBlock &MBB = *CMovI.getParent();
734 
735   // Insert a direct test of the saved register.
736   insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
737 
738   // Rewrite the CMov to use the !ZF flag from the test (but match register
739   // size and memory operand), and then kill its use of the flags afterward.
740   auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg());
741   CMovI.setDesc(TII->get(X86::getCMovFromCond(
742       Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8,
743       !CMovI.memoperands_empty())));
744   FlagUse.setIsKill(true);
745   DEBUG(dbgs() << "    fixed cmov: "; CMovI.dump());
746 }
747 
748 void X86FlagsCopyLoweringPass::rewriteCondJmp(
749     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
750     DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
751   // First get the register containing this specific condition.
752   X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode());
753   unsigned CondReg;
754   bool Inverted;
755   std::tie(CondReg, Inverted) =
756       getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
757 
758   MachineBasicBlock &JmpMBB = *JmpI.getParent();
759 
760   // Insert a direct test of the saved register.
761   insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);
762 
763   // Rewrite the jump to use the !ZF flag from the test, and kill its use of
764   // flags afterward.
765   JmpI.setDesc(TII->get(
766       X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE)));
767   const int ImplicitEFLAGSOpIdx = 1;
768   JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true);
769   DEBUG(dbgs() << "    fixed jCC: "; JmpI.dump());
770 }
771 
772 void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
773                                            MachineOperand &FlagUse,
774                                            MachineInstr &CopyDefI) {
775   // Just replace this copy with the the original copy def.
776   MRI->replaceRegWith(MI.getOperand(0).getReg(),
777                       CopyDefI.getOperand(0).getReg());
778   MI.eraseFromParent();
779 }
780 
781 void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
782     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
783     DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse,
784     CondRegArray &CondRegs) {
785   // This routine is only used to handle pseudos for setting a register to zero
786   // or all ones based on CF. This is essentially the sign extended from 1-bit
787   // form of SETB and modeled with the SETB_C* pseudos. They require special
788   // handling as they aren't normal SETcc instructions and are lowered to an
789   // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that
790   // they are only provided in reg-defining forms. A complicating factor is that
791   // they can define many different register widths.
792   assert(SetBI.getOperand(0).isReg() &&
793          "Cannot have a non-register defined operand to this variant of SETB!");
794 
795   // Little helper to do the common final step of replacing the register def'ed
796   // by this SETB instruction with a new register and removing the SETB
797   // instruction.
798   auto RewriteToReg = [&](unsigned Reg) {
799     MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg);
800     SetBI.eraseFromParent();
801   };
802 
803   // Grab the register class used for this particular instruction.
804   auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg());
805 
806   MachineBasicBlock &MBB = *SetBI.getParent();
807   auto SetPos = SetBI.getIterator();
808   auto SetLoc = SetBI.getDebugLoc();
809 
810   auto AdjustReg = [&](unsigned Reg) {
811     auto &OrigRC = *MRI->getRegClass(Reg);
812     if (&OrigRC == &SetBRC)
813       return Reg;
814 
815     unsigned NewReg;
816 
817     int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8;
818     int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8;
819     assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!");
820     assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!");
821     int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit,
822                        X86::NoSubRegister, X86::sub_32bit};
823 
824     // If the original size is smaller than the target *and* is smaller than 4
825     // bytes, we need to explicitly zero extend it. We always extend to 4-bytes
826     // to maximize the chance of being able to CSE that operation and to avoid
827     // partial dependency stalls extending to 2-bytes.
828     if (OrigRegSize < TargetRegSize && OrigRegSize < 4) {
829       NewReg = MRI->createVirtualRegister(&X86::GR32RegClass);
830       BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg)
831           .addReg(Reg);
832       if (&SetBRC == &X86::GR32RegClass)
833         return NewReg;
834       Reg = NewReg;
835       OrigRegSize = 4;
836     }
837 
838     NewReg = MRI->createVirtualRegister(&SetBRC);
839     if (OrigRegSize < TargetRegSize) {
840       BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG),
841               NewReg)
842           .addImm(0)
843           .addReg(Reg)
844           .addImm(SubRegIdx[OrigRegSize]);
845     } else if (OrigRegSize > TargetRegSize) {
846       BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::EXTRACT_SUBREG),
847               NewReg)
848           .addReg(Reg)
849           .addImm(SubRegIdx[TargetRegSize]);
850     } else {
851       BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg)
852           .addReg(Reg);
853     }
854     return NewReg;
855   };
856 
857   unsigned &CondReg = CondRegs[X86::COND_B];
858   if (!CondReg)
859     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B);
860 
861   // Adjust the condition to have the desired register width by zero-extending
862   // as needed.
863   // FIXME: We should use a better API to avoid the local reference and using a
864   // different variable here.
865   unsigned ExtCondReg = AdjustReg(CondReg);
866 
867   // Now we need to turn this into a bitmask. We do this by subtracting it from
868   // zero.
869   unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
870   BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
871   ZeroReg = AdjustReg(ZeroReg);
872 
873   unsigned Sub;
874   switch (SetBI.getOpcode()) {
875   case X86::SETB_C8r:
876     Sub = X86::SUB8rr;
877     break;
878 
879   case X86::SETB_C16r:
880     Sub = X86::SUB16rr;
881     break;
882 
883   case X86::SETB_C32r:
884     Sub = X86::SUB32rr;
885     break;
886 
887   case X86::SETB_C64r:
888     Sub = X86::SUB64rr;
889     break;
890 
891   default:
892     llvm_unreachable("Invalid SETB_C* opcode!");
893   }
894   unsigned ResultReg = MRI->createVirtualRegister(&SetBRC);
895   BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
896       .addReg(ZeroReg)
897       .addReg(ExtCondReg);
898   return RewriteToReg(ResultReg);
899 }
900 
901 void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
902                                             MachineBasicBlock::iterator TestPos,
903                                             DebugLoc TestLoc,
904                                             MachineInstr &SetCCI,
905                                             MachineOperand &FlagUse,
906                                             CondRegArray &CondRegs) {
907   X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode());
908   // Note that we can't usefully rewrite this to the inverse without complex
909   // analysis of the users of the setCC. Largely we rely on duplicates which
910   // could have been avoided already being avoided here.
911   unsigned &CondReg = CondRegs[Cond];
912   if (!CondReg)
913     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
914 
915   // Rewriting a register def is trivial: we just replace the register and
916   // remove the setcc.
917   if (!SetCCI.mayStore()) {
918     assert(SetCCI.getOperand(0).isReg() &&
919            "Cannot have a non-register defined operand to SETcc!");
920     MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg);
921     SetCCI.eraseFromParent();
922     return;
923   }
924 
925   // Otherwise, we need to emit a store.
926   auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(),
927                      SetCCI.getDebugLoc(), TII->get(X86::MOV8mr));
928   // Copy the address operands.
929   for (int i = 0; i < X86::AddrNumOperands; ++i)
930     MIB.add(SetCCI.getOperand(i));
931 
932   MIB.addReg(CondReg);
933 
934   MIB->setMemRefs(SetCCI.memoperands_begin(), SetCCI.memoperands_end());
935 
936   SetCCI.eraseFromParent();
937   return;
938 }
939