//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
/// %sgpr0 = SI_IF %vcc
///   %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
/// %sgpr0 = SI_ELSE %sgpr0
///   %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
/// SI_END_CF %sgpr0
///
/// becomes:
///
/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc  // Save and update the exec mask
/// %sgpr0 = S_XOR_B64 %sgpr0, %exec  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0  // Restore the exec mask for the Then block
/// %exec = S_XOR_B64 %sgpr0, %exec    // Update the exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0  // Do the ELSE block
/// label1:
/// %exec = S_OR_B64 %exec, %sgpr0     // Re-enable saved exec mask bits
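///
/// On wave32 subtargets the same lowering applies, using the 32-bit EXEC_LO
/// register and the *_B32 forms of the scalar mask instructions.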
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

static cl::opt<bool>
RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
    cl::init(true), cl::ReallyHidden);

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  LiveIntervals *LIS = nullptr;
  LiveVariables *LV = nullptr;
  MachineDominatorTree *MDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  SetVector<MachineInstr*> LoweredEndCf;
  DenseSet<Register> LoweredIf;
  SmallSet<MachineBasicBlock *, 4> KillBlocks;

  const TargetRegisterClass *BoolRC = nullptr;
  unsigned AndOpc;
  unsigned OrOpc;
  unsigned XorOpc;
  unsigned MovTermOpc;
  unsigned Andn2TermOpc;
  unsigned XorTermrOpc;
  unsigned OrTermrOpc;
  unsigned OrSaveExecOpc;
  unsigned Exec;

  bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);

  void emitIf(MachineInstr &MI);
  void emitElse(MachineInstr &MI);
  void emitIfBreak(MachineInstr &MI);
  void emitLoop(MachineInstr &MI);

  MachineBasicBlock *emitEndCf(MachineInstr &MI);

  void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);

  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                        SmallVectorImpl<MachineOperand> &Src) const;

  void combineMasks(MachineInstr &MI);

  bool removeMBBifRedundant(MachineBasicBlock &MBB);

  MachineBasicBlock *process(MachineInstr &MI);

  // Skip to the next instruction, ignoring debug instructions, and trivial
  // block boundaries (blocks that have one (typically fallthrough) successor,
  // and the successor has one predecessor).
  MachineBasicBlock::iterator
  skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator It) const;

  /// Find the insertion point for a new conditional branch.
  MachineBasicBlock::iterator
  skipToUncondBrOrEnd(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator I) const {
    assert(I->isTerminator());

    // FIXME: What if we had multiple pre-existing conditional branches?
    MachineBasicBlock::iterator End = MBB.end();
    while (I != End && !I->isUnconditionalBranch())
      ++I;
    return I;
  }

  // Remove redundant SI_END_CF instructions.
  void optimizeEndCf();

public:
  static char ID;

  SILowerControlFlow() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Should preserve the same set that TwoAddressInstructions does.
    AU.addPreserved<MachineDominatorTree>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreservedID(LiveVariablesID);
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
  MachineOperand &ImpDefSCC = MI.getOperand(3);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  ImpDefSCC.setIsDead(IsDead);
}

char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;

bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
                                 const MachineBasicBlock *End) {
  DenseSet<const MachineBasicBlock*> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (MBB == End || !Visited.insert(MBB).second)
      continue;
    if (KillBlocks.contains(MBB))
      return true;

    Worklist.append(MBB->succ_begin(), MBB->succ_end());
  }

  return false;
}

static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
  Register SaveExecReg = MI.getOperand(0).getReg();
  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);

  if (U == MRI->use_instr_nodbg_end() ||
      std::next(U) != MRI->use_instr_nodbg_end() ||
      U->getOpcode() != AMDGPU::SI_END_CF)
    return false;

  return true;
}

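// Lower SI_IF. With the wave64 opcodes selected, the emitted sequence is
// roughly:
//   %copy = COPY $exec (implicit-def $exec)
//   %tmp  = S_AND_B64 %copy, %cond
//   %dst  = S_XOR_B64 %tmp, %copy     // skipped for a "simple" if
//   $exec = S_MOV_B64_term %tmp
//   S_CBRANCH_EXECZ %endif-block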
void SILowerControlFlow::emitIf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);
  Register SaveExecReg = MI.getOperand(0).getReg();
  MachineOperand& Cond = MI.getOperand(1);
  assert(Cond.getSubReg() == AMDGPU::NoSubRegister);

  MachineOperand &ImpDefSCC = MI.getOperand(4);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  // If there is only one use of the save exec register and that use is
  // SI_END_CF, we can optimize SI_IF by returning the full saved exec mask
  // instead of just the cleared bits.
  bool SimpleIf = isSimpleIf(MI, MRI);

  if (SimpleIf) {
    // Check for SI_KILL_*_TERMINATOR on the path from the if to the endif.
    // If there is any such terminator, the simplification is not safe.
    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
  }

  // Add an implicit def of exec to discourage scheduling VALU after this,
  // which would interfere with trying to form s_and_saveexec_b64 later.
  Register CopyReg = SimpleIf ? SaveExecReg
                       : MRI->createVirtualRegister(BoolRC);
  MachineInstr *CopyExec =
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
    .addReg(Exec)
    .addReg(Exec, RegState::ImplicitDefine);
  LoweredIf.insert(CopyReg);

  Register Tmp = MRI->createVirtualRegister(BoolRC);

  MachineInstr *And =
    BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
    .addReg(CopyReg)
    .add(Cond);
  if (LV)
    LV->replaceKillInstruction(Cond.getReg(), MI, *And);

  setImpSCCDefDead(*And, true);

  MachineInstr *Xor = nullptr;
  if (!SimpleIf) {
    Xor =
      BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg)
      .addReg(Tmp)
      .addReg(CopyReg);
    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
  }

  // Use a copy that is a terminator to get correct spill code placement with
  // the fast register allocator.
  MachineInstr *SetExec =
    BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
    .addReg(Tmp, RegState::Kill);
  if (LV)
    LV->getVarInfo(Tmp).Kills.push_back(SetExec);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  I = skipToUncondBrOrEnd(MBB, I);

  // Insert the S_CBRANCH_EXECZ instruction which will be optimized later
  // during SIRemoveShortExecBranches.
  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
                            .add(MI.getOperand(2));

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->InsertMachineInstrInMaps(*CopyExec);

  // Replace MI with the AND so we don't need to fix the live interval for the
  // condition register.
  LIS->ReplaceMachineInstrInMaps(MI, *And);

  if (!SimpleIf)
    LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*SetExec);
  LIS->InsertMachineInstrInMaps(*NewBr);

  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
  MI.eraseFromParent();

  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
  // hard to add another def here but I'm not sure how to correctly update the
  // valno.
  LIS->removeInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(Tmp);
  if (!SimpleIf)
    LIS->createAndComputeVirtRegInterval(CopyReg);
}

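// Lower SI_ELSE. With the wave64 opcodes selected, the emitted sequence is
// roughly:
//   %save = S_OR_SAVEEXEC_B64 %src    // at block start, before phis/spills
//   %dst  = S_AND_B64 $exec, %save
//   $exec = S_XOR_B64_term $exec, %dst
//   S_CBRANCH_EXECZ %dest-block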
void SILowerControlFlow::emitElse(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  Register DstReg = MI.getOperand(0).getReg();

  MachineBasicBlock::iterator Start = MBB.begin();

  // This must be inserted before phis and any spill code inserted before the
  // else.
  Register SaveReg = MRI->createVirtualRegister(BoolRC);
  MachineInstr *OrSaveExec =
    BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
    .add(MI.getOperand(1)); // Saved EXEC
  if (LV)
    LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *OrSaveExec);

  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

  MachineBasicBlock::iterator ElsePt(MI);

  // This accounts for any modification of the EXEC mask within the block and
  // can be optimized out pre-RA when not required.
  MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
                          .addReg(Exec)
                          .addReg(SaveReg);

  if (LIS)
    LIS->InsertMachineInstrInMaps(*And);

  MachineInstr *Xor =
    BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
    .addReg(Exec)
    .addReg(DstReg);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  ElsePt = skipToUncondBrOrEnd(MBB, ElsePt);

  MachineInstr *Branch =
      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
          .addMBB(DestBB);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*OrSaveExec);

  LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*Branch);

  LIS->removeInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  // Let this be recomputed.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
}

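// Lower SI_IF_BREAK. With the wave64 opcodes selected, the emitted sequence
// is roughly:
//   %and = S_AND_B64 $exec, %break-cond   // omitted if already exec-masked
//   %dst = S_OR_B64 %and, %loop-exit-mask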
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  auto Dst = MI.getOperand(0).getReg();

  // Skip ANDing with exec if the break condition is already masked by exec
  // because it is a V_CMP in the same basic block. (We know the break
  // condition operand was an i1 in IR, so if it is a VALU instruction it must
  // be one with a carry-out.)
  bool SkipAnding = false;
  if (MI.getOperand(1).isReg()) {
    if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
      SkipAnding = Def->getParent() == MI.getParent()
          && SIInstrInfo::isVALU(*Def);
    }
  }

  // AND the break condition operand with exec, then OR that into the "loop
  // exit" mask.
  MachineInstr *And = nullptr, *Or = nullptr;
  if (!SkipAnding) {
    Register AndReg = MRI->createVirtualRegister(BoolRC);
    And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg)
             .addReg(Exec)
             .add(MI.getOperand(1));
    if (LV)
      LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *And);
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .addReg(AndReg)
             .add(MI.getOperand(2));
    if (LIS)
      LIS->createAndComputeVirtRegInterval(AndReg);
  } else {
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .add(MI.getOperand(1))
             .add(MI.getOperand(2));
    if (LV)
      LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *Or);
  }
  if (LV)
    LV->replaceKillInstruction(MI.getOperand(2).getReg(), MI, *Or);

  if (LIS) {
    if (And)
      LIS->InsertMachineInstrInMaps(*And);
    LIS->ReplaceMachineInstrInMaps(MI, *Or);
  }

  MI.eraseFromParent();
}

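// Lower SI_LOOP. With the wave64 opcodes selected, the emitted sequence is
// roughly:
//   $exec = S_ANDN2_B64_term $exec, %loop-exit-mask
//   S_CBRANCH_EXECNZ %loop-header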
void SILowerControlFlow::emitLoop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *AndN2 =
      BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
          .addReg(Exec)
          .add(MI.getOperand(0));

  auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator());
  MachineInstr *Branch =
      BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .add(MI.getOperand(1));

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
    LIS->InsertMachineInstrInMaps(*Branch);
  }

  MI.eraseFromParent();
}

MachineBasicBlock::iterator
SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {

  SmallSet<const MachineBasicBlock *, 4> Visited;
  MachineBasicBlock *B = &MBB;
  do {
    if (!Visited.insert(B).second)
      return MBB.end();

    auto E = B->end();
    for ( ; It != E; ++It) {
      if (TII->mayReadEXEC(*MRI, *It))
        break;
    }

    if (It != E)
      return It;

    if (B->succ_size() != 1)
      return MBB.end();

    // If there is one trivial successor, advance to the next block.
    MachineBasicBlock *Succ = *B->succ_begin();

    It = Succ->begin();
    B = Succ;
  } while (true);
}

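// Lower SI_END_CF. With the wave64 opcodes selected, the emitted sequence is
// roughly:
//   $exec = S_OR_B64 $exec, %saved-mask
// placed at the start of the block, or emitted as an S_OR_B64_term after
// splitting the block when an earlier instruction writes the saved mask.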
MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineBasicBlock::iterator InsPt = MBB.begin();

  // If we have instructions that aren't prolog instructions, split the block
  // and emit a terminator instruction. This ensures correct spill placement.
  // FIXME: We should unconditionally split the block here.
  bool NeedBlockSplit = false;
  Register DataReg = MI.getOperand(0).getReg();
  for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(DataReg, TRI)) {
      NeedBlockSplit = true;
      break;
    }
  }

  unsigned Opcode = OrOpc;
  MachineBasicBlock *SplitBB = &MBB;
  if (NeedBlockSplit) {
    SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
    if (MDT && SplitBB != &MBB) {
      MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
      SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
                                                 MBBNode->end());
      MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
      for (MachineDomTreeNode *Child : Children)
        MDT->changeImmediateDominator(Child, SplitBBNode);
    }
    Opcode = OrTermrOpc;
    InsPt = MI;
  }

  MachineInstr *NewMI =
    BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
    .addReg(Exec)
    .add(MI.getOperand(0));
  if (LV)
    LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *NewMI);

  LoweredEndCf.insert(NewMI);

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);

  MI.eraseFromParent();

  if (LIS)
    LIS->handleMove(*NewMI);
  return SplitBB;
}

// Returns replacement operands for a logical operation: either a single
// operand for exec, or two operands if the source was another equivalent
// operation.
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
       SmallVectorImpl<MachineOperand> &Src) const {
  MachineOperand &Op = MI.getOperand(OpNo);
  if (!Op.isReg() || !Op.getReg().isVirtual()) {
    Src.push_back(Op);
    return;
  }

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getParent() != MI.getParent() ||
      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
    return;

  // Make sure we do not modify exec between the def and the use.
  // A copy with an implicitly defined exec inserted earlier is an exception;
  // it does not really modify exec.
  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
        !(I->isCopy() && I->getOperand(0).getReg() != Exec))
      return;

  for (const auto &SrcOp : Def->explicit_operands())
    if (SrcOp.isReg() && SrcOp.isUse() &&
        (SrcOp.getReg().isVirtual() || SrcOp.getReg() == Exec))
      Src.push_back(SrcOp);
}

// Search and combine pairs of equivalent instructions, like
// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
// One of the operands is the exec mask.
void SILowerControlFlow::combineMasks(MachineInstr &MI) {
  assert(MI.getNumExplicitOperands() == 3);
  SmallVector<MachineOperand, 4> Ops;
  unsigned OpToReplace = 1;
  findMaskOperands(MI, 1, Ops);
  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
  findMaskOperands(MI, 2, Ops);
  if (Ops.size() != 3) return;

  unsigned UniqueOpndIdx;
  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else return;

  Register Reg = MI.getOperand(OpToReplace).getReg();
  MI.RemoveOperand(OpToReplace);
  MI.addOperand(Ops[UniqueOpndIdx]);
  if (MRI->use_empty(Reg))
    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}

void SILowerControlFlow::optimizeEndCf() {
  // If the only instruction immediately following this END_CF is another
  // END_CF in the only successor, we can avoid emitting the exec mask
  // restore here.
  if (!RemoveRedundantEndcf)
    return;

  for (MachineInstr *MI : LoweredEndCf) {
    MachineBasicBlock &MBB = *MI->getParent();
    auto Next =
      skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
    if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
      continue;
    // Only skip the inner END_CF if the outer END_CF belongs to an SI_IF.
    // If it belongs to an SI_ELSE then the saved mask has an inverted value.
    Register SavedExec
      = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
    assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");

    const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
    if (Def && LoweredIf.count(SavedExec)) {
      LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
      if (LIS)
        LIS->RemoveMachineInstrFromMaps(*MI);
      Register Reg;
      if (LV)
        Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
      MI->eraseFromParent();
      if (LV)
        LV->recomputeForSingleDefVirtReg(Reg);
      removeMBBifRedundant(MBB);
    }
  }
}

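// Dispatch a single control-flow pseudo instruction to its lowering routine,
// then clean up exec-mask bit operations around the lowered code. Returns the
// block holding the code that follows the pseudo, which differs from the
// original block if lowering split it.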
MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::iterator I(MI);
  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;

  MachineBasicBlock *SplitBB = &MBB;

  switch (MI.getOpcode()) {
  case AMDGPU::SI_IF:
    emitIf(MI);
    break;

  case AMDGPU::SI_ELSE:
    emitElse(MI);
    break;

  case AMDGPU::SI_IF_BREAK:
    emitIfBreak(MI);
    break;

  case AMDGPU::SI_LOOP:
    emitLoop(MI);
    break;

  case AMDGPU::SI_WATERFALL_LOOP:
    MI.setDesc(TII->get(AMDGPU::S_CBRANCH_EXECNZ));
    break;

  case AMDGPU::SI_END_CF:
    SplitBB = emitEndCf(MI);
    break;

  default:
    assert(false && "Attempt to process unsupported instruction");
    break;
  }

  MachineBasicBlock::iterator Next;
  for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
    Next = std::next(I);
    MachineInstr &MaskMI = *I;
    switch (MaskMI.getOpcode()) {
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32:
      // Clean up bit manipulations on the exec mask.
      combineMasks(MaskMI);
      break;
    default:
      I = MBB.end();
      break;
    }
  }

  return SplitBB;
}

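// Lower SI_INIT_EXEC (immediate mask) and SI_INIT_EXEC_FROM_INPUT (thread
// count taken from an SGPR argument) by initializing EXEC at the start of the
// block, before any vector instructions are executed.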
void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
                                       MachineInstr &MI) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  bool IsWave32 = ST.isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
        .addImm(MI.getOperand(0).getImm());
    if (LIS)
      LIS->RemoveMachineInstrFromMaps(MI);
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If `InputReg` is defined in the current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If the first instruction is the definition, then advance the
        // insertion point past it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST.getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  if (LV)
    LV->recomputeForSingleDefVirtReg(InputReg);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  if (LV)
    LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

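// Erase MBB if it contains nothing but debug instructions and an optional
// unconditional branch, redirecting all predecessors to its single successor
// and inserting a fallthrough or S_BRANCH as needed. Returns true if the
// block was removed.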
bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
  for (auto &I : MBB.instrs()) {
    if (!I.isDebugInstr() && !I.isUnconditionalBranch())
      return false;
  }

  assert(MBB.succ_size() == 1 && "MBB has more than one successor");

  MachineBasicBlock *Succ = *MBB.succ_begin();
  MachineBasicBlock *FallThrough = nullptr;

  while (!MBB.predecessors().empty()) {
    MachineBasicBlock *P = *MBB.pred_begin();
    if (P->getFallThrough() == &MBB)
      FallThrough = P;
    P->ReplaceUsesOfBlockWith(&MBB, Succ);
  }
  MBB.removeSuccessor(Succ);
  if (LIS) {
    for (auto &I : MBB.instrs())
      LIS->RemoveMachineInstrFromMaps(I);
  }
  if (MDT) {
    // If Succ, the single successor of MBB, is dominated by MBB, MDT needs
    // updating by changing Succ's idom to MBB's idom; otherwise, MBB must be
    // a leaf node in MDT and can be erased directly.
    if (MDT->dominates(&MBB, Succ))
      MDT->changeImmediateDominator(MDT->getNode(Succ),
                                    MDT->getNode(&MBB)->getIDom());
    MDT->eraseNode(&MBB);
  }
  MBB.clear();
  MBB.eraseFromParent();
  if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
    if (!Succ->canFallThrough()) {
      MachineFunction *MF = FallThrough->getParent();
      MachineFunction::iterator FallThroughPos(FallThrough);
      MF->splice(std::next(FallThroughPos), Succ);
    } else
      BuildMI(*FallThrough, FallThrough->end(),
              FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH))
          .addMBB(Succ);
  }

  return true;
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  // This doesn't actually need LiveIntervals, but we can preserve them.
  LIS = getAnalysisIfAvailable<LiveIntervals>();
  // This doesn't actually need LiveVariables, but we can preserve them.
  LV = getAnalysisIfAvailable<LiveVariables>();
  MDT = getAnalysisIfAvailable<MachineDominatorTree>();
  MRI = &MF.getRegInfo();
  BoolRC = TRI->getBoolRC();

  if (ST.isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    OrOpc = AMDGPU::S_OR_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    MovTermOpc = AMDGPU::S_MOV_B32_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
    XorTermrOpc = AMDGPU::S_XOR_B32_term;
    OrTermrOpc = AMDGPU::S_OR_B32_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    OrOpc = AMDGPU::S_OR_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    MovTermOpc = AMDGPU::S_MOV_B64_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
    XorTermrOpc = AMDGPU::S_XOR_B64_term;
    OrTermrOpc = AMDGPU::S_OR_B64_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    Exec = AMDGPU::EXEC;
  }

  // Compute set of blocks with kills
  const bool CanDemote =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
  for (auto &MBB : MF) {
    bool IsKillBlock = false;
    for (auto &Term : MBB.terminators()) {
      if (TII->isKillTerminator(Term.getOpcode())) {
        KillBlocks.insert(&MBB);
        IsKillBlock = true;
        break;
      }
    }
    if (CanDemote && !IsKillBlock) {
      for (auto &MI : MBB) {
        if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
          KillBlocks.insert(&MBB);
          break;
        }
      }
    }
  }

  MachineFunction::iterator NextBB;
  for (MachineFunction::iterator BI = MF.begin();
       BI != MF.end(); BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock *MBB = &*BI;

    MachineBasicBlock::iterator I, E, Next;
    E = MBB->end();
    for (I = MBB->begin(); I != E; I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;
      MachineBasicBlock *SplitMBB = MBB;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
      case AMDGPU::SI_ELSE:
      case AMDGPU::SI_IF_BREAK:
      case AMDGPU::SI_WATERFALL_LOOP:
      case AMDGPU::SI_LOOP:
      case AMDGPU::SI_END_CF:
        SplitMBB = process(MI);
        break;

      // FIXME: find a better place for this
      case AMDGPU::SI_INIT_EXEC:
      case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
        lowerInitExec(MBB, MI);
        if (LIS)
          LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
        break;

      default:
        break;
      }

      if (SplitMBB != MBB) {
        MBB = Next->getParent();
        E = MBB->end();
      }
    }
  }

  optimizeEndCf();

  LoweredEndCf.clear();
  LoweredIf.clear();
  KillBlocks.clear();

  return true;
}