//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <new>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "r600cf"

namespace {

struct CFStack {

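  // Models the hardware control-flow stack: a full ENTRY occupies one whole
  // stack slot, while the other kinds are sub-entries that share a slot (see
  // getSubEntrySize() and updateMaxStackSize() for the packing).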
  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const R600Subtarget *ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries = 0;
  unsigned CurrentSubEntries = 0;

  CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0) {}

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  return llvm::is_contained(BranchStack, Item);
}

bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST->hasCFAluBug())
    return false;

  switch (Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST->getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.  Applying
      // this work-around when CurrentSubEntries > 3 allows us to over-allocate
      // stack resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST->getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}

unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST->hasCaymanISA());
    if (ST->getGeneration() <= R600Subtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}

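// Sub-entries are packed four to a full stack entry, so convert the current
// sub-entry count to whole entries by rounding up. (Dividing by four appears
// conservative for wavefront-32 parts, where the comments above suggest eight
// sub-entries fit in one entry.)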
void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize =
      CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}

void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST->hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        // May not be required on Evergreen/NI; see comment in
        // CFStack::getSubEntrySize().
        Item = CFStack::FIRST_NON_WQM_PUSH;
      else if (CurrentEntries > 0 &&
               ST->getGeneration() > R600Subtarget::EVERGREEN &&
               !ST->hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}

class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  const R600InstrInfo *TII = nullptr;
  const R600RegisterInfo *TRI = nullptr;
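  // Maximum number of fetch instructions allowed in one clause; taken from
  // the subtarget in runOnMachineFunction().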
  unsigned MaxFetchInst;
  const R600Subtarget *ST = nullptr;

  bool IsTrivialInst(MachineInstr &MI) const {
    switch (MI.getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST->hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

  bool isCompatibleWithClause(const MachineInstr &MI,
                              std::set<unsigned> &DstRegs) const {
    // Initialize to the null register so an instruction with no use (or no
    // def) operand cannot match a stale, uninitialized value below.
    unsigned DstMI = 0, SrcMI = 0;
    for (const MachineOperand &MO : MI.operands()) {
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if (DstRegs.find(SrcMI) == DstRegs.end()) {
      DstRegs.insert(DstMI);
      return true;
    }
    return false;
  }

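  // Collects the run of consecutive texture (or vertex) fetch instructions
  // starting at I into a single clause, and emits the corresponding CF_TC /
  // CF_VC clause header with a placeholder ADDR that is fixed up later by
  // EmitFetchClause().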
  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(*ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(*I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(*I)) ||
          (!IsTex && !TII->usesVertexCache(*I)))
        break;
      if (!isCompatibleWithClause(*I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(&*I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
        getHWInstrDesc(IsTex ? CF_TC : CF_VC))
        .addImm(0) // ADDR
        .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, std::move(ClauseContent));
  }

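  // Rewrites every ALU_LITERAL_X source of MI to one of the four literal
  // channels (X/Y/Z/W), reusing a channel when the same immediate already
  // appears in Lits and allocating a new one otherwise.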
  void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (const auto &Src : Srcs) {
      if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Src.second;
      std::vector<MachineOperand *>::iterator It =
          llvm::find_if(Lits, [&](MachineOperand *val) {
            return val->isImm() && (val->getImm() == Imm);
          });

      // Get corresponding Operand
      MachineOperand &Operand = MI.getOperand(
          TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));

      if (It != Lits.end()) {
        // Reuse existing literal reg
        unsigned Index = It - Lits.begin();
        Src.first->setReg(LiteralRegs[Index]);
      } else {
        // Allocate new literal reg
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Src.first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(&Operand);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
          TII->get(AMDGPU::LITERALS))
          .addImm(LiteralPair0)
          .addImm(LiteralPair1);
    }
    return InsertPos;
  }

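  // Builds an ALU clause starting at the clause header at I: unbundles
  // instruction groups, assigns literal channels via getLiteral(), and emits
  // LITERALS pseudo instructions carrying the immediates in pairs. Also
  // patches the clause header's instruction count.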
  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineInstr &ClauseHead = *I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(*I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<MachineOperand *> Literals;
      if (I->isBundle()) {
        MachineInstr &DeleteMI = *I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (MachineOperand &MO : BI->operands()) {
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(*BI, Literals);
          ClauseContent.push_back(&*BI);
        }
        I = BI;
        DeleteMI.eraseFromParent();
      } else {
        getLiteral(*I, Literals);
        ClauseContent.push_back(&*I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
            TII->get(AMDGPU::LITERALS));
        if (Literals[i]->isImm()) {
          MILit.addImm(Literals[i]->getImm());
        } else {
          MILit.addGlobalAddress(Literals[i]->getGlobal(),
                                 Literals[i]->getOffset());
        }
        if (i + 1 < e) {
          if (Literals[i + 1]->isImm()) {
            MILit.addImm(Literals[i + 1]->getImm());
          } else {
            MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
                                   Literals[i + 1]->getOffset());
          }
        } else
          MILit.addImm(0);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
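    // Operand 7 of the CF_ALU clause header is its COUNT field (number of
    // instructions in the clause minus one).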
    ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(&ClauseHead, std::move(ClauseContent));
  }

  void EmitFetchClause(MachineBasicBlock::iterator InsertPos,
                       const DebugLoc &DL, ClauseFile &Clause,
                       unsigned &CfCount) {
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
    for (MachineInstr *MI : Clause.second)
      BB->splice(InsertPos, BB, MI);
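    // Each fetch instruction accounts for two CF-word slots (fetch encodings
    // are twice the width of a CF word), hence the factor of two.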
    CfCount += 2 * Clause.second.size();
  }

  void EmitALUClause(MachineBasicBlock::iterator InsertPos, const DebugLoc &DL,
                     ClauseFile &Clause, unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
    for (MachineInstr *MI : Clause.second)
      BB->splice(InsertPos, BB, MI);
    CfCount += Clause.second.size();
  }

  void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
    MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (MachineInstr *MI : MIs) {
      CounterPropagateAddr(*MI, Addr);
    }
  }

public:
  static char ID;

  R600ControlFlowFinalizer() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    ST = &MF.getSubtarget<R600Subtarget>();
    MaxFetchInst = ST->getTexVTXClauseSize();
    TII = ST->getInstrInfo();
    TRI = ST->getRegisterInfo();

    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MF.getFunction()->getCallingConv());
    for (MachineBasicBlock &MBB : MF) {
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
            getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
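      // LastAlu tracks the most recent CF_ALU at each if-nesting depth; when
      // an ENDIF immediately follows one, the POP is folded into it later by
      // rewriting it to CF_ALU_POP_AFTER (see the ToPopAfter loop below).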
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E;) {
        if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = &*MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          LLVM_FALLTHROUGH;
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(std::move(Pair));
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *>> Pair =
              std::move(LoopStack.back());
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_ELSE))
              .addImm(0)
              .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                getHWInstrDesc(CF_POP))
                .addImm(CfCount + 1)
                .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_BREAK))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_CONTINUE))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          DebugLoc DL = MBB.findDebugLoc(MI);
          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
          CfCount++;
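          // Pad the CF program to an even number of slots; the hardware
          // appears to fetch CF instructions in aligned pairs.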
          if (CfCount % 2) {
            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
            CfCount++;
          }
          MI->eraseFromParent();
          for (ClauseFile &Clause : FetchClauses)
            EmitFetchClause(I, DL, Clause, CfCount);
          for (ClauseFile &Clause : AluClauses)
            EmitALUClause(I, DL, Clause, CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
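      // Rewrite each CF_ALU recorded in ToPopAfter as CF_ALU_POP_AFTER,
      // copying all nine operands, so the stack pop happens implicitly at the
      // end of the clause instead of needing a separate POP instruction.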
      for (MachineInstr *Alu : ToPopAfter) {
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
            TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->CFStackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  StringRef getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(R600ControlFlowFinalizer, DEBUG_TYPE,
                      "R600 Control Flow Finalizer", false, false)
INITIALIZE_PASS_END(R600ControlFlowFinalizer, DEBUG_TYPE,
                    "R600 Control Flow Finalizer", false, false)

char R600ControlFlowFinalizer::ID = 0;

char &llvm::R600ControlFlowFinalizerID = R600ControlFlowFinalizer::ID;

FunctionPass *llvm::createR600ControlFlowFinalizer() {
  return new R600ControlFlowFinalizer();
}