1 //====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 ///
11 /// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual
12 /// flag bits.
13 ///
14 /// We have to do this by carefully analyzing and rewriting the usage of the
15 /// copied EFLAGS register because there is no general way to rematerialize the
16 /// entire EFLAGS register safely and efficiently. Using `popf` both forces
17 /// dynamic stack adjustment and can create correctness issues due to IF, TF,
18 /// and other non-status flags being overwritten. Using sequences involving
19 /// SAHF don't work on all x86 processors and are often quite slow compared to
20 /// directly testing a single status preserved in its own GPR.
21 ///
22 //===----------------------------------------------------------------------===//
23 
24 #include "X86.h"
25 #include "X86InstrBuilder.h"
26 #include "X86InstrInfo.h"
27 #include "X86Subtarget.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/ScopeExit.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallSet.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/SparseBitVector.h"
36 #include "llvm/ADT/Statistic.h"
37 #include "llvm/CodeGen/MachineBasicBlock.h"
38 #include "llvm/CodeGen/MachineConstantPool.h"
39 #include "llvm/CodeGen/MachineDominators.h"
40 #include "llvm/CodeGen/MachineFunction.h"
41 #include "llvm/CodeGen/MachineFunctionPass.h"
42 #include "llvm/CodeGen/MachineInstr.h"
43 #include "llvm/CodeGen/MachineInstrBuilder.h"
44 #include "llvm/CodeGen/MachineModuleInfo.h"
45 #include "llvm/CodeGen/MachineOperand.h"
46 #include "llvm/CodeGen/MachineRegisterInfo.h"
47 #include "llvm/CodeGen/MachineSSAUpdater.h"
48 #include "llvm/CodeGen/TargetInstrInfo.h"
49 #include "llvm/CodeGen/TargetRegisterInfo.h"
50 #include "llvm/CodeGen/TargetSchedule.h"
51 #include "llvm/CodeGen/TargetSubtargetInfo.h"
52 #include "llvm/IR/DebugLoc.h"
53 #include "llvm/MC/MCSchedule.h"
54 #include "llvm/Pass.h"
55 #include "llvm/Support/CommandLine.h"
56 #include "llvm/Support/Debug.h"
57 #include "llvm/Support/raw_ostream.h"
58 #include <algorithm>
59 #include <cassert>
60 #include <iterator>
61 #include <utility>
62 
using namespace llvm;

// Command-line key for this pass; doubles as the debug type used by
// LLVM_DEBUG output filtering.
#define PASS_KEY "x86-flags-copy-lowering"
#define DEBUG_TYPE PASS_KEY

// Statistics counters bumped as the rewrites below insert or remove
// instructions.
STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
STATISTIC(NumTestsInserted, "Number of test instructions inserted");
STATISTIC(NumAddsInserted, "Number of adds instructions inserted");

namespace llvm {

// Forward declaration of the pass initializer generated by the
// INITIALIZE_PASS machinery below.
void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);

} // end namespace llvm
78 
namespace {

// Convenient array type for storing registers associated with each condition.
// Indexed by X86::CondCode; a zero entry means no register currently holds
// that condition.
using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;

class X86FlagsCopyLoweringPass : public MachineFunctionPass {
public:
  X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {
    initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  // Cached per-function state, set up at the top of runOnMachineFunction.
  MachineRegisterInfo *MRI;
  const X86InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  // Register class used for the byte registers that hold saved condition
  // values (set to GR8 in runOnMachineFunction).
  const TargetRegisterClass *PromoteRC;
  MachineDominatorTree *MDT;

  /// Harvest conditions already materialized into virtual registers by
  /// setCC instructions preceding \p CopyDefI in \p MBB.
  CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
                                  MachineInstr &CopyDefI);

  /// Insert a setCC for \p Cond at \p TestPos and return the new virtual
  /// register holding its result.
  unsigned promoteCondToReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator TestPos,
                            DebugLoc TestLoc, X86::CondCode Cond);
  /// Return a register holding \p Cond or its inverse (materializing \p Cond
  /// if neither exists), plus a flag indicating whether the returned
  /// register holds the inverse.
  std::pair<unsigned, bool>
  getCondOrInverseInReg(MachineBasicBlock &TestMBB,
                        MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                        X86::CondCode Cond, CondRegArray &CondRegs);
  /// Insert a TEST of \p Reg against itself at \p Pos to re-materialize a
  /// flag state from a saved condition byte.
  void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
                  DebugLoc Loc, unsigned Reg);

  // Rewriters for the various kinds of EFLAGS consumers. Each replaces one
  // use of the copied flags with a use of the saved condition registers.
  void rewriteArithmetic(MachineBasicBlock &TestMBB,
                         MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                         MachineInstr &MI, MachineOperand &FlagUse,
                         CondRegArray &CondRegs);
  void rewriteCMov(MachineBasicBlock &TestMBB,
                   MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                   MachineInstr &CMovI, MachineOperand &FlagUse,
                   CondRegArray &CondRegs);
  void rewriteCondJmp(MachineBasicBlock &TestMBB,
                      MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                      MachineInstr &JmpI, CondRegArray &CondRegs);
  void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
                   MachineInstr &CopyDefI);
  void rewriteSetCarryExtended(MachineBasicBlock &TestMBB,
                               MachineBasicBlock::iterator TestPos,
                               DebugLoc TestLoc, MachineInstr &SetBI,
                               MachineOperand &FlagUse, CondRegArray &CondRegs);
  void rewriteSetCC(MachineBasicBlock &TestMBB,
                    MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                    MachineInstr &SetCCI, MachineOperand &FlagUse,
                    CondRegArray &CondRegs);
};

} // end anonymous namespace
141 
// Register the pass so it can be referenced by name and added to pipelines.
INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE,
                      "X86 EFLAGS copy lowering", false, false)
INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE,
                    "X86 EFLAGS copy lowering", false, false)

// Factory used by the X86 target to construct this pass for its pipeline.
FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
  return new X86FlagsCopyLoweringPass();
}

// Pass ID storage; its address serves as the unique pass identifier.
char X86FlagsCopyLoweringPass::ID = 0;
152 
void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
  // Dominance info is required: runOnMachineFunction verifies that the
  // flag-defining block dominates every use block it rewrites.
  AU.addRequired<MachineDominatorTree>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
157 
namespace {
/// An enumeration of the arithmetic instruction mnemonics which have
/// interesting flag semantics.
///
/// We can map instruction opcodes into these mnemonics to make it easy to
/// dispatch with specific functionality.
enum class FlagArithMnemonic {
  ADC,  // Add with carry: consumes CF (see rewriteArithmetic).
  ADCX, // Unsigned add with carry: consumes CF.
  ADOX, // Unsigned add with overflow: consumes OF.
  RCL,  // Rotate left through carry: consumes CF.
  RCR,  // Rotate right through carry: consumes CF.
  SBB,  // Subtract with borrow: consumes CF.
};
} // namespace
173 
/// Classify \p Opcode as one of the supported flag-consuming arithmetic
/// mnemonics, or abort with a fatal error for any opcode we cannot lower.
static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    report_fatal_error("No support for lowering a copy into EFLAGS when used "
                       "by this instruction!");

// Expand one mnemonic into case labels for all four operand widths with a
// fixed operand-form suffix.
#define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX)                              \
  case X86::MNEMONIC##8##SUFFIX:                                               \
  case X86::MNEMONIC##16##SUFFIX:                                              \
  case X86::MNEMONIC##32##SUFFIX:                                              \
  case X86::MNEMONIC##64##SUFFIX:

// Expand every register/memory/immediate form of an ADC- or SBB-style
// mnemonic. Note that the immediate forms are irregular across widths, hence
// the explicit list.
#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC)                                    \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr)                                        \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV)                                    \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm)                                        \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr)                                        \
  case X86::MNEMONIC##8ri:                                                     \
  case X86::MNEMONIC##16ri8:                                                   \
  case X86::MNEMONIC##32ri8:                                                   \
  case X86::MNEMONIC##64ri8:                                                   \
  case X86::MNEMONIC##16ri:                                                    \
  case X86::MNEMONIC##32ri:                                                    \
  case X86::MNEMONIC##64ri32:                                                  \
  case X86::MNEMONIC##8mi:                                                     \
  case X86::MNEMONIC##16mi8:                                                   \
  case X86::MNEMONIC##32mi8:                                                   \
  case X86::MNEMONIC##64mi8:                                                   \
  case X86::MNEMONIC##16mi:                                                    \
  case X86::MNEMONIC##32mi:                                                    \
  case X86::MNEMONIC##64mi32:                                                  \
  case X86::MNEMONIC##8i8:                                                     \
  case X86::MNEMONIC##16i16:                                                   \
  case X86::MNEMONIC##32i32:                                                   \
  case X86::MNEMONIC##64i32:

    // Each macro expansion below produces only case labels which fall
    // through into the subsequent return.
    LLVM_EXPAND_ADC_SBB_INSTR(ADC)
    return FlagArithMnemonic::ADC;

    LLVM_EXPAND_ADC_SBB_INSTR(SBB)
    return FlagArithMnemonic::SBB;

#undef LLVM_EXPAND_ADC_SBB_INSTR

    LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
    LLVM_EXPAND_INSTR_SIZES(RCL, r1)
    LLVM_EXPAND_INSTR_SIZES(RCL, ri)
    return FlagArithMnemonic::RCL;

    LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
    LLVM_EXPAND_INSTR_SIZES(RCR, r1)
    LLVM_EXPAND_INSTR_SIZES(RCR, ri)
    return FlagArithMnemonic::RCR;

#undef LLVM_EXPAND_INSTR_SIZES

  case X86::ADCX32rr:
  case X86::ADCX64rr:
  case X86::ADCX32rm:
  case X86::ADCX64rm:
    return FlagArithMnemonic::ADCX;

  case X86::ADOX32rr:
  case X86::ADOX64rr:
  case X86::ADOX32rm:
  case X86::ADOX64rm:
    return FlagArithMnemonic::ADOX;
  }
}
243 
/// Split \p MBB at the conditional branch \p SplitI, moving \p SplitI and all
/// subsequent terminators into a newly created block.
///
/// Updates successor lists, branch probabilities, and PHI operands in the
/// affected successor blocks, and returns the new block. Requires that
/// \p SplitI is a jCC immediately preceded by another jCC (the pattern that
/// arises when multiple conditional jumps in one block consume the flags).
static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
                                     MachineInstr &SplitI,
                                     const X86InstrInfo &TII) {
  MachineFunction &MF = *MBB.getParent();

  assert(SplitI.getParent() == &MBB &&
         "Split instruction must be in the split block!");
  assert(SplitI.isBranch() &&
         "Only designed to split a tail of branch instructions!");
  assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID &&
         "Must split on an actual jCC instruction!");

  // Dig out the previous instruction to the split point.
  MachineInstr &PrevI = *std::prev(SplitI.getIterator());
  assert(PrevI.isBranch() && "Must split after a branch!");
  assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID &&
         "Must split after an actual jCC instruction!");
  assert(!std::prev(PrevI.getIterator())->isTerminator() &&
         "Must only have this one terminator prior to the split!");

  // Grab the one successor edge that will stay in `MBB`.
  MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB();

  // Analyze the original block to see if we are actually splitting an edge
  // into two edges. This can happen when we have multiple conditional jumps to
  // the same successor.
  bool IsEdgeSplit =
      std::any_of(SplitI.getIterator(), MBB.instr_end(),
                  [&](MachineInstr &MI) {
                    assert(MI.isTerminator() &&
                           "Should only have spliced terminators!");
                    return llvm::any_of(
                        MI.operands(), [&](MachineOperand &MOp) {
                          return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc;
                        });
                  }) ||
      MBB.getFallThrough() == &UnsplitSucc;

  MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();

  // Insert the new block immediately after the current one. Any existing
  // fallthrough will be sunk into this new block anyways.
  MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);

  // Splice the tail of instructions into the new block.
  NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end());

  // Copy the necessary successors (and their probability info) into the new
  // block.
  for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
    if (IsEdgeSplit || *SI != &UnsplitSucc)
      NewMBB.copySuccessor(&MBB, SI);
  // Normalize the probabilities if we didn't end up splitting the edge.
  if (!IsEdgeSplit)
    NewMBB.normalizeSuccProbs();

  // Now replace all of the moved successors in the original block with the new
  // block. This will merge their probabilities.
  for (MachineBasicBlock *Succ : NewMBB.successors())
    if (Succ != &UnsplitSucc)
      MBB.replaceSuccessor(Succ, &NewMBB);

  // We should always end up replacing at least one successor.
  assert(MBB.isSuccessor(&NewMBB) &&
         "Failed to make the new block a successor!");

  // Now update all the PHIs.
  for (MachineBasicBlock *Succ : NewMBB.successors()) {
    for (MachineInstr &MI : *Succ) {
      // PHIs are clustered at the top of the block; stop at the first
      // non-PHI.
      if (!MI.isPHI())
        break;

      // PHI operands come in (value, pred-block) pairs starting at index 1.
      for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
           OpIdx += 2) {
        MachineOperand &OpV = MI.getOperand(OpIdx);
        MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
        assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
        if (OpMBB.getMBB() != &MBB)
          continue;

        // Replace the operand for unsplit successors
        if (!IsEdgeSplit || Succ != &UnsplitSucc) {
          OpMBB.setMBB(&NewMBB);

          // We have to continue scanning as there may be multiple entries in
          // the PHI.
          continue;
        }

        // When we have split the edge append a new successor.
        MI.addOperand(MF, OpV);
        MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
        break;
      }
    }
  }

  return NewMBB;
}
343 
/// Main driver: find every COPY into EFLAGS, save the needed status flags
/// into byte registers at the point of the original flag definition, and
/// rewrite each consumer of the copied flags to use those registers instead.
bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << " **********\n");

  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
  MRI = &MF.getRegInfo();
  TII = Subtarget.getInstrInfo();
  TRI = Subtarget.getRegisterInfo();
  MDT = &getAnalysis<MachineDominatorTree>();
  PromoteRC = &X86::GR8RegClass;

  if (MF.begin() == MF.end())
    // Nothing to do for a degenerate empty function...
    return false;

  // Collect all the copies into EFLAGS up front: the rewriting below mutates
  // the instruction stream, so we can't discover them as we go.
  SmallVector<MachineInstr *, 4> Copies;
  for (MachineBasicBlock &MBB : MF)
    for (MachineInstr &MI : MBB)
      if (MI.getOpcode() == TargetOpcode::COPY &&
          MI.getOperand(0).getReg() == X86::EFLAGS)
        Copies.push_back(&MI);

  for (MachineInstr *CopyI : Copies) {
    MachineBasicBlock &MBB = *CopyI->getParent();

    MachineOperand &VOp = CopyI->getOperand(1);
    assert(VOp.isReg() &&
           "The input to the copy for EFLAGS should always be a register!");
    MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg());
    if (CopyDefI.getOpcode() != TargetOpcode::COPY) {
      // FIXME: The big likely candidate here are PHI nodes. We could in theory
      // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard
      // enough that it is probably better to change every other part of LLVM
      // to avoid creating them. The issue is that once we have PHIs we won't
      // know which original EFLAGS value we need to capture with our setCCs
      // below. The end result will be computing a complete set of setCCs that
      // we *might* want, computing them in every place where we copy *out* of
      // EFLAGS and then doing SSA formation on all of them to insert necessary
      // PHI nodes and consume those here. Then hoping that somehow we DCE the
      // unnecessary ones. This DCE seems very unlikely to be successful and so
      // we will almost certainly end up with a glut of dead setCC
      // instructions. Until we have a motivating test case and fail to avoid
      // it by changing other parts of LLVM's lowering, we refuse to handle
      // this complex case here.
      LLVM_DEBUG(
          dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
          CopyDefI.dump());
      report_fatal_error(
          "Cannot lower EFLAGS copy unless it is defined in turn by a copy!");
    }

    auto Cleanup = make_scope_exit([&] {
      // All uses of the EFLAGS copy are now rewritten, kill the copy into
      // eflags and if dead the copy from.
      CopyI->eraseFromParent();
      if (MRI->use_empty(CopyDefI.getOperand(0).getReg()))
        CopyDefI.eraseFromParent();
      ++NumCopiesEliminated;
    });

    MachineOperand &DOp = CopyI->getOperand(0);
    assert(DOp.isDef() && "Expected register def!");
    assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!");
    // A dead def means no uses to rewrite; the scope-exit cleanup still
    // erases the copies.
    if (DOp.isDead())
      continue;

    // The setCCs we insert go where the flags were originally defined (the
    // source of the copied-from COPY).
    MachineBasicBlock &TestMBB = *CopyDefI.getParent();
    auto TestPos = CopyDefI.getIterator();
    DebugLoc TestLoc = CopyDefI.getDebugLoc();

    LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());

    // Scan for usage of newly set EFLAGS so we can rewrite them. We just buffer
    // jumps because their usage is very constrained.
    bool FlagsKilled = false;
    SmallVector<MachineInstr *, 4> JmpIs;

    // Gather the condition flags that have already been preserved in
    // registers. We do this from scratch each time as we expect there to be
    // very few of them and we expect to not revisit the same copy definition
    // many times. If either of those change sufficiently we could build a map
    // of these up front instead.
    CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);

    // Collect the basic blocks we need to scan. Typically this will just be
    // a single basic block but we may have to scan multiple blocks if the
    // EFLAGS copy lives into successors.
    SmallVector<MachineBasicBlock *, 2> Blocks;
    SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
    Blocks.push_back(&MBB);
    VisitedBlocks.insert(&MBB);

    do {
      MachineBasicBlock &UseMBB = *Blocks.pop_back_val();

      // We currently don't do any PHI insertion and so we require that the
      // test basic block dominates all of the use basic blocks.
      //
      // We could in theory do PHI insertion here if it becomes useful by just
      // taking undef values in along every edge that we don't trace this
      // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
      // but still seems like a great deal of complexity.
      //
      // Because it is theoretically possible that some earlier MI pass or
      // other lowering transformation could induce this to happen, we do
      // a hard check even in non-debug builds here.
      if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
        LLVM_DEBUG({
          dbgs() << "ERROR: Encountered use that is not dominated by our test "
                    "basic block! Rewriting this would require inserting PHI "
                    "nodes to track the flag state across the CFG.\n\nTest "
                    "block:\n";
          TestMBB.dump();
          dbgs() << "Use block:\n";
          UseMBB.dump();
        });
        report_fatal_error("Cannot lower EFLAGS copy when original copy def "
                           "does not dominate all uses.");
      }

      // In the block containing the copy, start scanning just after it;
      // in any other (successor) block, scan from the top.
      for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
                                      : UseMBB.instr_begin(),
                MIE = UseMBB.instr_end();
           MII != MIE;) {
        MachineInstr &MI = *MII++;
        MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
        if (!FlagUse) {
          if (MI.findRegisterDefOperand(X86::EFLAGS)) {
            // If EFLAGS are defined, it's as-if they were killed. We can stop
            // scanning here.
            //
            // NB!!! Many instructions only modify some flags. LLVM currently
            // models this as clobbering all flags, but if that ever changes
            // this will need to be carefully updated to handle that more
            // complex logic.
            FlagsKilled = true;
            break;
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << "  Rewriting use: "; MI.dump());

        // Check the kill flag before we rewrite as that may change it.
        if (FlagUse->isKill())
          FlagsKilled = true;

        // Once we encounter a branch, the rest of the instructions must also be
        // branches. We can't rewrite in place here, so we handle them below.
        //
        // Note that we don't have to handle tail calls here, even conditional
        // tail calls, as those are not introduced into the X86 MI until post-RA
        // branch folding or block placement. As a consequence, we get to deal
        // with the simpler formulation of conditional branches followed by tail
        // calls.
        if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
          auto JmpIt = MI.getIterator();
          do {
            JmpIs.push_back(&*JmpIt);
            ++JmpIt;
          } while (JmpIt != UseMBB.instr_end() &&
                   X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
                       X86::COND_INVALID);
          break;
        }

        // Otherwise we can just rewrite in-place.
        if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
          rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
        } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
                   X86::COND_INVALID) {
          rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
        } else if (MI.getOpcode() == TargetOpcode::COPY) {
          rewriteCopy(MI, *FlagUse, CopyDefI);
        } else {
          // We assume all other instructions that use flags also def them.
          assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
                 "Expected a def of EFLAGS for this instruction!");

          // NB!!! Several arithmetic instructions only *partially* update
          // flags. Theoretically, we could generate MI code sequences that
          // would rely on this fact and observe different flags independently.
          // But currently LLVM models all of these instructions as clobbering
          // all the flags in an undef way. We rely on that to simplify the
          // logic.
          FlagsKilled = true;

          switch (MI.getOpcode()) {
          case X86::SETB_C8r:
          case X86::SETB_C16r:
          case X86::SETB_C32r:
          case X86::SETB_C64r:
            // Use custom lowering for arithmetic that is merely extending the
            // carry flag. We model this as the SETB_C* pseudo instructions.
            rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse,
                                    CondRegs);
            break;

          default:
            // Generically handle remaining uses as arithmetic instructions.
            rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse,
                              CondRegs);
            break;
          }
          break;
        }

        // If this was the last use of the flags, we're done.
        if (FlagsKilled)
          break;
      }

      // If the flags were killed, we're done with this block.
      if (FlagsKilled)
        break;

      // Otherwise we need to scan successors for ones where the flags live-in
      // and queue those up for processing.
      for (MachineBasicBlock *SuccMBB : UseMBB.successors())
        if (SuccMBB->isLiveIn(X86::EFLAGS) &&
            VisitedBlocks.insert(SuccMBB).second)
          Blocks.push_back(SuccMBB);
    } while (!Blocks.empty());

    // Now rewrite the jumps that use the flags. These we handle specially
    // because if there are multiple jumps in a single basic block we'll have
    // to do surgery on the CFG.
    MachineBasicBlock *LastJmpMBB = nullptr;
    for (MachineInstr *JmpI : JmpIs) {
      // Past the first jump within a basic block we need to split the blocks
      // apart.
      if (JmpI->getParent() == LastJmpMBB)
        splitBlock(*JmpI->getParent(), *JmpI, *TII);
      else
        LastJmpMBB = JmpI->getParent();

      rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
    }

    // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
    // the copy's def operand is itself a kill.
  }

#ifndef NDEBUG
  // Verify we left no EFLAGS copies behind.
  for (MachineBasicBlock &MBB : MF)
    for (MachineInstr &MI : MBB)
      if (MI.getOpcode() == TargetOpcode::COPY &&
          (MI.getOperand(0).getReg() == X86::EFLAGS ||
           MI.getOperand(1).getReg() == X86::EFLAGS)) {
        LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: ";
                   MI.dump());
        llvm_unreachable("Unlowered EFLAGS copy!");
      }
#endif

  return true;
}
601 
602 /// Collect any conditions that have already been set in registers so that we
603 /// can re-use them rather than adding duplicates.
604 CondRegArray
605 X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB,
606                                              MachineInstr &CopyDefI) {
607   CondRegArray CondRegs = {};
608 
609   // Scan backwards across the range of instructions with live EFLAGS.
610   for (MachineInstr &MI : llvm::reverse(
611            llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) {
612     X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
613     if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() &&
614         TRI->isVirtualRegister(MI.getOperand(0).getReg()))
615       CondRegs[Cond] = MI.getOperand(0).getReg();
616 
617     // Stop scanning when we see the first definition of the EFLAGS as prior to
618     // this we would potentially capture the wrong flag state.
619     if (MI.findRegisterDefOperand(X86::EFLAGS))
620       break;
621   }
622   return CondRegs;
623 }
624 
625 unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
626     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
627     DebugLoc TestLoc, X86::CondCode Cond) {
628   unsigned Reg = MRI->createVirtualRegister(PromoteRC);
629   auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
630                       TII->get(X86::getSETFromCond(Cond)), Reg);
631   (void)SetI;
632   LLVM_DEBUG(dbgs() << "    save cond: "; SetI->dump());
633   ++NumSetCCsInserted;
634   return Reg;
635 }
636 
637 std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
638     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
639     DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
640   unsigned &CondReg = CondRegs[Cond];
641   unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)];
642   if (!CondReg && !InvCondReg)
643     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
644 
645   if (CondReg)
646     return {CondReg, false};
647   else
648     return {InvCondReg, true};
649 }
650 
651 void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
652                                           MachineBasicBlock::iterator Pos,
653                                           DebugLoc Loc, unsigned Reg) {
654   auto TestI =
655       BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg);
656   (void)TestI;
657   LLVM_DEBUG(dbgs() << "    test cond: "; TestI->dump());
658   ++NumTestsInserted;
659 }
660 
661 void X86FlagsCopyLoweringPass::rewriteArithmetic(
662     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
663     DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
664     CondRegArray &CondRegs) {
665   // Arithmetic is either reading CF or OF. Figure out which condition we need
666   // to preserve in a register.
667   X86::CondCode Cond;
668 
669   // The addend to use to reset CF or OF when added to the flag value.
670   int Addend;
671 
672   switch (getMnemonicFromOpcode(MI.getOpcode())) {
673   case FlagArithMnemonic::ADC:
674   case FlagArithMnemonic::ADCX:
675   case FlagArithMnemonic::RCL:
676   case FlagArithMnemonic::RCR:
677   case FlagArithMnemonic::SBB:
678     Cond = X86::COND_B; // CF == 1
679     // Set up an addend that when one is added will need a carry due to not
680     // having a higher bit available.
681     Addend = 255;
682     break;
683 
684   case FlagArithMnemonic::ADOX:
685     Cond = X86::COND_O; // OF == 1
686     // Set up an addend that when one is added will turn from positive to
687     // negative and thus overflow in the signed domain.
688     Addend = 127;
689     break;
690   }
691 
692   // Now get a register that contains the value of the flag input to the
693   // arithmetic. We require exactly this flag to simplify the arithmetic
694   // required to materialize it back into the flag.
695   unsigned &CondReg = CondRegs[Cond];
696   if (!CondReg)
697     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
698 
699   MachineBasicBlock &MBB = *MI.getParent();
700 
701   // Insert an instruction that will set the flag back to the desired value.
702   unsigned TmpReg = MRI->createVirtualRegister(PromoteRC);
703   auto AddI =
704       BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
705           .addDef(TmpReg, RegState::Dead)
706           .addReg(CondReg)
707           .addImm(Addend);
708   (void)AddI;
709   LLVM_DEBUG(dbgs() << "    add cond: "; AddI->dump());
710   ++NumAddsInserted;
711   FlagUse.setIsKill(true);
712 }
713 
714 void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
715                                            MachineBasicBlock::iterator TestPos,
716                                            DebugLoc TestLoc,
717                                            MachineInstr &CMovI,
718                                            MachineOperand &FlagUse,
719                                            CondRegArray &CondRegs) {
720   // First get the register containing this specific condition.
721   X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode());
722   unsigned CondReg;
723   bool Inverted;
724   std::tie(CondReg, Inverted) =
725       getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
726 
727   MachineBasicBlock &MBB = *CMovI.getParent();
728 
729   // Insert a direct test of the saved register.
730   insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
731 
732   // Rewrite the CMov to use the !ZF flag from the test (but match register
733   // size and memory operand), and then kill its use of the flags afterward.
734   auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg());
735   CMovI.setDesc(TII->get(X86::getCMovFromCond(
736       Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8,
737       !CMovI.memoperands_empty())));
738   FlagUse.setIsKill(true);
739   LLVM_DEBUG(dbgs() << "    fixed cmov: "; CMovI.dump());
740 }
741 
742 void X86FlagsCopyLoweringPass::rewriteCondJmp(
743     MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
744     DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
745   // First get the register containing this specific condition.
746   X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode());
747   unsigned CondReg;
748   bool Inverted;
749   std::tie(CondReg, Inverted) =
750       getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
751 
752   MachineBasicBlock &JmpMBB = *JmpI.getParent();
753 
754   // Insert a direct test of the saved register.
755   insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);
756 
757   // Rewrite the jump to use the !ZF flag from the test, and kill its use of
758   // flags afterward.
759   JmpI.setDesc(TII->get(
760       X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE)));
761   const int ImplicitEFLAGSOpIdx = 1;
762   JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true);
763   LLVM_DEBUG(dbgs() << "    fixed jCC: "; JmpI.dump());
764 }
765 
766 void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
767                                            MachineOperand &FlagUse,
768                                            MachineInstr &CopyDefI) {
769   // Just replace this copy with the original copy def.
770   MRI->replaceRegWith(MI.getOperand(0).getReg(),
771                       CopyDefI.getOperand(0).getReg());
772   MI.eraseFromParent();
773 }
774 
/// Rewrite a SETB_C* pseudo that reads a copied EFLAGS value.
///
/// Recovers CF from the saved condition register (promoting it at the test
/// point if it wasn't already saved), adjusts that value to the pseudo's
/// register width, and replaces the pseudo with a `zero - cond` subtraction,
/// producing the zero/all-ones result the SETB_C* pseudo models.
void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse,
    CondRegArray &CondRegs) {
  // This routine is only used to handle pseudos for setting a register to zero
  // or all ones based on CF. This is essentially the sign extended from 1-bit
  // form of SETB and modeled with the SETB_C* pseudos. They require special
  // handling as they aren't normal SETcc instructions and are lowered to an
  // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that
  // they are only provided in reg-defining forms. A complicating factor is that
  // they can define many different register widths.
  assert(SetBI.getOperand(0).isReg() &&
         "Cannot have a non-register defined operand to this variant of SETB!");

  // Little helper to do the common final step of replacing the register def'ed
  // by this SETB instruction with a new register and removing the SETB
  // instruction.
  auto RewriteToReg = [&](unsigned Reg) {
    MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg);
    SetBI.eraseFromParent();
  };

  // Grab the register class used for this particular instruction.
  auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg());

  MachineBasicBlock &MBB = *SetBI.getParent();
  auto SetPos = SetBI.getIterator();
  auto SetLoc = SetBI.getDebugLoc();

  // Helper that returns a register in SetBRC holding the same value as `Reg`,
  // zero-extending or sub-register-extracting as needed. Returns `Reg` itself
  // when its class already matches.
  auto AdjustReg = [&](unsigned Reg) {
    auto &OrigRC = *MRI->getRegClass(Reg);
    if (&OrigRC == &SetBRC)
      return Reg;

    unsigned NewReg;

    int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8;
    int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8;
    assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!");
    assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!");
    // Sub-register index for a given size in bytes; index 3 is a placeholder
    // since there is no 3-byte GPR.
    int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit,
                       X86::NoSubRegister, X86::sub_32bit};

    // If the original size is smaller than the target *and* is smaller than 4
    // bytes, we need to explicitly zero extend it. We always extend to 4-bytes
    // to maximize the chance of being able to CSE that operation and to avoid
    // partial dependency stalls extending to 2-bytes.
    if (OrigRegSize < TargetRegSize && OrigRegSize < 4) {
      NewReg = MRI->createVirtualRegister(&X86::GR32RegClass);
      BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg)
          .addReg(Reg);
      if (&SetBRC == &X86::GR32RegClass)
        return NewReg;
      // Fall through to the size-based cases below with the 32-bit extension
      // as the new source.
      Reg = NewReg;
      OrigRegSize = 4;
    }

    NewReg = MRI->createVirtualRegister(&SetBRC);
    if (OrigRegSize < TargetRegSize) {
      // Widen by placing the smaller value into the low sub-register of an
      // implicitly zeroed wider register.
      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG),
              NewReg)
          .addImm(0)
          .addReg(Reg)
          .addImm(SubRegIdx[OrigRegSize]);
    } else if (OrigRegSize > TargetRegSize) {
      // Narrow by extracting the low sub-register of the target's width.
      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::EXTRACT_SUBREG),
              NewReg)
          .addReg(Reg)
          .addImm(SubRegIdx[TargetRegSize]);
    } else {
      // Same size but different register class: a plain copy suffices.
      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg)
          .addReg(Reg);
    }
    return NewReg;
  };

  // Get (or materialize at the test point) the saved CF value as a 0-or-1
  // byte register.
  unsigned &CondReg = CondRegs[X86::COND_B];
  if (!CondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B);

  // Adjust the condition to have the desired register width by zero-extending
  // as needed.
  // FIXME: We should use a better API to avoid the local reference and using a
  // different variable here.
  unsigned ExtCondReg = AdjustReg(CondReg);

  // Now we need to turn this into a bitmask. We do this by subtracting it from
  // zero.
  unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
  ZeroReg = AdjustReg(ZeroReg);

  // Pick the subtract opcode matching the register width the pseudo defines.
  unsigned Sub;
  switch (SetBI.getOpcode()) {
  case X86::SETB_C8r:
    Sub = X86::SUB8rr;
    break;

  case X86::SETB_C16r:
    Sub = X86::SUB16rr;
    break;

  case X86::SETB_C32r:
    Sub = X86::SUB32rr;
    break;

  case X86::SETB_C64r:
    Sub = X86::SUB64rr;
    break;

  default:
    llvm_unreachable("Invalid SETB_C* opcode!");
  }
  // `0 - CF` yields 0 when CF is clear and all ones when CF is set; replace
  // the pseudo's def with this result.
  unsigned ResultReg = MRI->createVirtualRegister(&SetBRC);
  BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
      .addReg(ZeroReg)
      .addReg(ExtCondReg);
  return RewriteToReg(ResultReg);
}
894 
895 void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
896                                             MachineBasicBlock::iterator TestPos,
897                                             DebugLoc TestLoc,
898                                             MachineInstr &SetCCI,
899                                             MachineOperand &FlagUse,
900                                             CondRegArray &CondRegs) {
901   X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode());
902   // Note that we can't usefully rewrite this to the inverse without complex
903   // analysis of the users of the setCC. Largely we rely on duplicates which
904   // could have been avoided already being avoided here.
905   unsigned &CondReg = CondRegs[Cond];
906   if (!CondReg)
907     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
908 
909   // Rewriting a register def is trivial: we just replace the register and
910   // remove the setcc.
911   if (!SetCCI.mayStore()) {
912     assert(SetCCI.getOperand(0).isReg() &&
913            "Cannot have a non-register defined operand to SETcc!");
914     MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg);
915     SetCCI.eraseFromParent();
916     return;
917   }
918 
919   // Otherwise, we need to emit a store.
920   auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(),
921                      SetCCI.getDebugLoc(), TII->get(X86::MOV8mr));
922   // Copy the address operands.
923   for (int i = 0; i < X86::AddrNumOperands; ++i)
924     MIB.add(SetCCI.getOperand(i));
925 
926   MIB.addReg(CondReg);
927 
928   MIB->setMemRefs(SetCCI.memoperands_begin(), SetCCI.memoperands_end());
929 
930   SetCCI.eraseFromParent();
931   return;
932 }
933