1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Insert wait instructions for memory reads and writes.
12 ///
13 /// Memory reads and writes are issued asynchronously, so we need to insert
14 /// S_WAITCNT instructions when we want to access any of their results or
15 /// overwrite any register that's used asynchronously.
16 //
17 //===----------------------------------------------------------------------===//
18 
19 #include "AMDGPU.h"
20 #include "AMDGPUSubtarget.h"
21 #include "SIDefines.h"
22 #include "SIInstrInfo.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "SIRegisterInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/DenseMap.h"
27 #include "llvm/ADT/DenseSet.h"
28 #include "llvm/ADT/PostOrderIterator.h"
29 #include "llvm/ADT/STLExtras.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/CodeGen/MachineBasicBlock.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/MachineFunctionPass.h"
34 #include "llvm/CodeGen/MachineInstr.h"
35 #include "llvm/CodeGen/MachineInstrBuilder.h"
36 #include "llvm/CodeGen/MachineLoopInfo.h"
37 #include "llvm/CodeGen/MachineMemOperand.h"
38 #include "llvm/CodeGen/MachineOperand.h"
39 #include "llvm/CodeGen/MachineRegisterInfo.h"
40 #include "llvm/IR/DebugLoc.h"
41 #include "llvm/Pass.h"
42 #include "llvm/Support/Debug.h"
43 #include "llvm/Support/DebugCounter.h"
44 #include "llvm/Support/ErrorHandling.h"
45 #include "llvm/Support/raw_ostream.h"
46 #include <algorithm>
47 #include <cassert>
48 #include <cstdint>
49 #include <cstring>
50 #include <memory>
51 #include <utility>
52 #include <vector>
53 
54 using namespace llvm;
55 
56 #define DEBUG_TYPE "si-insert-waitcnts"
57 
// Debug counters, one per hardware wait counter. They are consulted by
// SIInsertWaitcnts::setForceEmitWaitcnt() (only when NDEBUG is not defined) to
// decide whether a wait on the corresponding counter should be forced.
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");
64 
// Hidden command-line option "amdgpu-waitcnt-forcezero"; defaults to 0.
// Per its description, it forces every emitted waitcnt to be
// s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0).
static cl::opt<unsigned> ForceEmitZeroFlag(
  "amdgpu-waitcnt-forcezero",
  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
  cl::init(0), cl::Hidden);
69 
70 namespace {
71 
// Class of object that encapsulates latest instruction counter score
// associated with the operand.  Used for determining whether
// s_waitcnt instruction needs to be emitted.

// Bit mask with only the bit of counter type t set. updateByWait() returns
// such a mask to report which counter needs a wait.
#define CNT_MASK(t) (1u << (t))

// The wait counters tracked by this pass. NUM_INST_CNTS is the number of
// counters and is used to size the per-counter arrays.
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

// Half-open range [first, second) of scoreboard slots covered by a register
// operand. getRegInterval() returns {-1, -1} for operands it does not track.
using RegInterval = std::pair<signed, signed>;
81 
// Limits for the wait counters and register counts. The three *cntMax fields
// are what BlockWaitcntBrackets::getWaitCountMax() returns.
// NOTE(review): presumably filled in per subtarget when the pass runs; the
// assignment is not visible in this part of the file.
struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;
89 
// Register encoding values used by getRegInterval() to turn a register's
// hardware encoding into a zero-based index: VGPR0/SGPR0 are subtracted from
// the encoding, and VGPRL is the upper bound checked by its assertion.
// NOTE(review): SGPRL presumably is the matching upper bound for SGPRs; it is
// not referenced in this part of the file.
struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;
96 
// Kinds of event that advance a wait counter. The counter each event belongs
// to is given by BlockWaitcntBrackets::eventCounter(). NUM_WAIT_EVENTS is the
// number of event kinds and sizes the per-event arrays.
enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};
110 
// Layout of the scoreboard slot index space shared by VGPRs and SGPRs.
// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
124 
// Expands to a for-loop header that declares w and steps it through every
// WaitEventType value from 0 up to, but not including, NUM_WAIT_EVENTS.
#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))
129 
130 // This is a per-basic-block object that maintains current score brackets
131 // of each wait counter, and a per-register scoreboard for each wait counter.
132 // We also maintain the latest score for every event type that can change the
133 // waitcnt in order to know if there are multiple types of events within
134 // the brackets. When multiple types of event happen in the bracket,
135 // wait count may get decreased out of order, therefore we need to put in
136 // "s_waitcnt 0" before use.
137 class BlockWaitcntBrackets {
138 public:
139   BlockWaitcntBrackets() {
140     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
141          T = (enum InstCounterType)(T + 1)) {
142       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
143     }
144   }
145 
146   ~BlockWaitcntBrackets() = default;
147 
148   static int32_t getWaitCountMax(InstCounterType T) {
149     switch (T) {
150     case VM_CNT:
151       return HardwareLimits.VmcntMax;
152     case LGKM_CNT:
153       return HardwareLimits.LgkmcntMax;
154     case EXP_CNT:
155       return HardwareLimits.ExpcntMax;
156     default:
157       break;
158     }
159     return 0;
160   }
161 
162   void setScoreLB(InstCounterType T, int32_t Val) {
163     assert(T < NUM_INST_CNTS);
164     if (T >= NUM_INST_CNTS)
165       return;
166     ScoreLBs[T] = Val;
167   }
168 
169   void setScoreUB(InstCounterType T, int32_t Val) {
170     assert(T < NUM_INST_CNTS);
171     if (T >= NUM_INST_CNTS)
172       return;
173     ScoreUBs[T] = Val;
174     if (T == EXP_CNT) {
175       int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
176       if (ScoreLBs[T] < UB)
177         ScoreLBs[T] = UB;
178     }
179   }
180 
181   int32_t getScoreLB(InstCounterType T) {
182     assert(T < NUM_INST_CNTS);
183     if (T >= NUM_INST_CNTS)
184       return 0;
185     return ScoreLBs[T];
186   }
187 
188   int32_t getScoreUB(InstCounterType T) {
189     assert(T < NUM_INST_CNTS);
190     if (T >= NUM_INST_CNTS)
191       return 0;
192     return ScoreUBs[T];
193   }
194 
195   // Mapping from event to counter.
196   InstCounterType eventCounter(WaitEventType E) {
197     switch (E) {
198     case VMEM_ACCESS:
199       return VM_CNT;
200     case LDS_ACCESS:
201     case GDS_ACCESS:
202     case SQ_MESSAGE:
203     case SMEM_ACCESS:
204       return LGKM_CNT;
205     case EXP_GPR_LOCK:
206     case GDS_GPR_LOCK:
207     case VMW_GPR_LOCK:
208     case EXP_POS_ACCESS:
209     case EXP_PARAM_ACCESS:
210       return EXP_CNT;
211     default:
212       llvm_unreachable("unhandled event type");
213     }
214     return NUM_INST_CNTS;
215   }
216 
217   void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
218     if (GprNo < NUM_ALL_VGPRS) {
219       if (GprNo > VgprUB) {
220         VgprUB = GprNo;
221       }
222       VgprScores[T][GprNo] = Val;
223     } else {
224       assert(T == LGKM_CNT);
225       if (GprNo - NUM_ALL_VGPRS > SgprUB) {
226         SgprUB = GprNo - NUM_ALL_VGPRS;
227       }
228       SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
229     }
230   }
231 
232   int32_t getRegScore(int GprNo, InstCounterType T) {
233     if (GprNo < NUM_ALL_VGPRS) {
234       return VgprScores[T][GprNo];
235     }
236     return SgprScores[GprNo - NUM_ALL_VGPRS];
237   }
238 
239   void clear() {
240     memset(ScoreLBs, 0, sizeof(ScoreLBs));
241     memset(ScoreUBs, 0, sizeof(ScoreUBs));
242     memset(EventUBs, 0, sizeof(EventUBs));
243     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
244          T = (enum InstCounterType)(T + 1)) {
245       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
246     }
247     memset(SgprScores, 0, sizeof(SgprScores));
248   }
249 
250   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
251                              const MachineRegisterInfo *MRI,
252                              const SIRegisterInfo *TRI, unsigned OpNo,
253                              bool Def) const;
254 
255   void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
256                    const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
257                    unsigned OpNo, int32_t Val);
258 
259   void setWaitAtBeginning() { WaitAtBeginning = true; }
260   void clearWaitAtBeginning() { WaitAtBeginning = false; }
261   bool getWaitAtBeginning() const { return WaitAtBeginning; }
262   void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
263   int32_t getMaxVGPR() const { return VgprUB; }
264   int32_t getMaxSGPR() const { return SgprUB; }
265 
266   int32_t getEventUB(enum WaitEventType W) const {
267     assert(W < NUM_WAIT_EVENTS);
268     return EventUBs[W];
269   }
270 
271   bool counterOutOfOrder(InstCounterType T);
272   unsigned int updateByWait(InstCounterType T, int ScoreToWait);
273   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
274                      const MachineRegisterInfo *MRI, WaitEventType E,
275                      MachineInstr &MI);
276 
277   bool hasPendingSMEM() const {
278     return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
279             EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
280   }
281 
282   bool hasPendingFlat() const {
283     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
284              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
285             (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
286              LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
287   }
288 
289   void setPendingFlat() {
290     LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
291     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
292   }
293 
294   int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
295 
296   void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
297 
298   bool getRevisitLoop() const { return RevisitLoop; }
299   void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
300 
301   void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
302   int32_t getPostOrder() const { return PostOrder; }
303 
304   void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
305   void clearWaitcnt() { Waitcnt = nullptr; }
306   MachineInstr *getWaitcnt() const { return Waitcnt; }
307 
308   bool mixedExpTypes() const { return MixedExpTypes; }
309   void setMixedExpTypes(bool MixedExpTypesIn) {
310     MixedExpTypes = MixedExpTypesIn;
311   }
312 
313   void print(raw_ostream &);
314   void dump() { print(dbgs()); }
315 
316 private:
317   bool WaitAtBeginning = false;
318   bool RevisitLoop = false;
319   bool MixedExpTypes = false;
320   int32_t PostOrder = 0;
321   MachineInstr *Waitcnt = nullptr;
322   int32_t ScoreLBs[NUM_INST_CNTS] = {0};
323   int32_t ScoreUBs[NUM_INST_CNTS] = {0};
324   int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
325   // Remember the last flat memory operation.
326   int32_t LastFlat[NUM_INST_CNTS] = {0};
327   // wait_cnt scores for every vgpr.
328   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
329   int32_t VgprUB = 0;
330   int32_t SgprUB = 0;
331   int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
332   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
333   int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
334 };
335 
336 // This is a per-loop-region object that records waitcnt status at the end of
337 // loop footer from the previous iteration. We also maintain an iteration
338 // count to track the number of times the loop has been visited. When it
339 // doesn't converge naturally, we force convergence by inserting s_waitcnt 0
340 // at the end of the loop footer.
341 class LoopWaitcntData {
342 public:
343   LoopWaitcntData() = default;
344   ~LoopWaitcntData() = default;
345 
346   void incIterCnt() { IterCnt++; }
347   void resetIterCnt() { IterCnt = 0; }
348   int32_t getIterCnt() { return IterCnt; }
349 
350   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
351   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
352 
353   void print() {
354     DEBUG(dbgs() << "  iteration " << IterCnt << '\n';);
355   }
356 
357 private:
358   // s_waitcnt added at the end of loop footer to stablize wait scores
359   // at the end of the loop footer.
360   MachineInstr *LfWaitcnt = nullptr;
361   // Number of iterations the loop has been visited, not including the initial
362   // walk over.
363   int32_t IterCnt = 0;
364 };
365 
366 class SIInsertWaitcnts : public MachineFunctionPass {
367 private:
368   const SISubtarget *ST = nullptr;
369   const SIInstrInfo *TII = nullptr;
370   const SIRegisterInfo *TRI = nullptr;
371   const MachineRegisterInfo *MRI = nullptr;
372   const MachineLoopInfo *MLI = nullptr;
373   AMDGPU::IsaInfo::IsaVersion IV;
374   AMDGPUAS AMDGPUASI;
375 
376   DenseSet<MachineBasicBlock *> BlockVisitedSet;
377   DenseSet<MachineInstr *> TrackedWaitcntSet;
378   DenseSet<MachineInstr *> VCCZBugHandledSet;
379 
380   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
381       BlockWaitcntBracketsMap;
382 
383   std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
384 
385   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
386 
387   std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
388 
389   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
390   // because of amdgpu-waitcnt-forcezero flag
391   bool ForceEmitZeroWaitcnts;
392   bool ForceEmitWaitcnt[NUM_INST_CNTS];
393 
394 public:
395   static char ID;
396 
397   SIInsertWaitcnts() : MachineFunctionPass(ID) {}
398 
399   bool runOnMachineFunction(MachineFunction &MF) override;
400 
401   StringRef getPassName() const override {
402     return "SI insert wait instructions";
403   }
404 
405   void getAnalysisUsage(AnalysisUsage &AU) const override {
406     AU.setPreservesCFG();
407     AU.addRequired<MachineLoopInfo>();
408     MachineFunctionPass::getAnalysisUsage(AU);
409   }
410 
411   void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
412     // The waitcnt information is copied because it changes as the block is
413     // traversed.
414     KillWaitBrackets.push_back(
415         llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
416   }
417 
418   bool isForceEmitWaitcnt() const {
419     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
420          T = (enum InstCounterType)(T + 1))
421       if (ForceEmitWaitcnt[T])
422         return true;
423     return false;
424   }
425 
426   void setForceEmitWaitcnt() {
427 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
428 // For debug builds, get the debug counter info and adjust if need be
429 #ifndef NDEBUG
430     if (DebugCounter::isCounterSet(ForceExpCounter) &&
431         DebugCounter::shouldExecute(ForceExpCounter)) {
432       ForceEmitWaitcnt[EXP_CNT] = true;
433     } else {
434       ForceEmitWaitcnt[EXP_CNT] = false;
435     }
436 
437     if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
438          DebugCounter::shouldExecute(ForceLgkmCounter)) {
439       ForceEmitWaitcnt[LGKM_CNT] = true;
440     } else {
441       ForceEmitWaitcnt[LGKM_CNT] = false;
442     }
443 
444     if (DebugCounter::isCounterSet(ForceVMCounter) &&
445         DebugCounter::shouldExecute(ForceVMCounter)) {
446       ForceEmitWaitcnt[VM_CNT] = true;
447     } else {
448       ForceEmitWaitcnt[VM_CNT] = false;
449     }
450 #endif // NDEBUG
451   }
452 
453   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
454   void generateWaitcntInstBefore(MachineInstr &MI,
455                                   BlockWaitcntBrackets *ScoreBrackets);
456   void updateEventWaitcntAfter(MachineInstr &Inst,
457                                BlockWaitcntBrackets *ScoreBrackets);
458   void mergeInputScoreBrackets(MachineBasicBlock &Block);
459   bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
460   unsigned countNumBottomBlocks(const MachineLoop *Loop);
461   void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
462   void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
463   bool isWaitcntStronger(unsigned LHS, unsigned RHS);
464   unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
465 };
466 
467 } // end anonymous namespace
468 
469 RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
470                                                  const SIInstrInfo *TII,
471                                                  const MachineRegisterInfo *MRI,
472                                                  const SIRegisterInfo *TRI,
473                                                  unsigned OpNo,
474                                                  bool Def) const {
475   const MachineOperand &Op = MI->getOperand(OpNo);
476   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
477       (Def && !Op.isDef()))
478     return {-1, -1};
479 
480   // A use via a PW operand does not need a waitcnt.
481   // A partial write is not a WAW.
482   assert(!Op.getSubReg() || !Op.isUndef());
483 
484   RegInterval Result;
485   const MachineRegisterInfo &MRIA = *MRI;
486 
487   unsigned Reg = TRI->getEncodingValue(Op.getReg());
488 
489   if (TRI->isVGPR(MRIA, Op.getReg())) {
490     assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
491     Result.first = Reg - RegisterEncoding.VGPR0;
492     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
493   } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
494     assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
495     Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
496     assert(Result.first >= NUM_ALL_VGPRS &&
497            Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
498   }
499   // TODO: Handle TTMP
500   // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
501   else
502     return {-1, -1};
503 
504   const MachineInstr &MIA = *MI;
505   const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
506   unsigned Size = TRI->getRegSizeInBits(*RC);
507   Result.second = Result.first + (Size / 32);
508 
509   return Result;
510 }
511 
512 void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
513                                        const SIInstrInfo *TII,
514                                        const SIRegisterInfo *TRI,
515                                        const MachineRegisterInfo *MRI,
516                                        unsigned OpNo, int32_t Val) {
517   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
518   DEBUG({
519     const MachineOperand &Opnd = MI->getOperand(OpNo);
520     assert(TRI->isVGPR(*MRI, Opnd.getReg()));
521   });
522   for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
523     setRegScore(RegNo, EXP_CNT, Val);
524   }
525 }
526 
/// Record that instruction \p Inst generated an event of type \p E.
///
/// The upper bound of the counter that \p E maps to is advanced by one and the
/// new score is stored as the latest score of \p E. Registers of \p Inst are
/// then tagged with that score:
///  - for EXP_CNT events, source VGPRs chosen according to the kind of
///    instruction (DS, FLAT, MIMG, MTBUF, MUBUF, or anything else);
///  - for the other counters, the registers defined by \p Inst (SGPR
///    definitions are skipped for VM_CNT), plus the extra LDS slot when
///    \p Inst is a DS instruction that may store.
void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not.
  // Examples include vm_cnt for a buffer-store or lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    // Once set, MixedExpTypes is not cleared by this function.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        // Stores: score data0 and, when the opcode has one, data1.
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        // Score every VGPR use operand of the instruction.
        // Note: the loop bound E below shadows the WaitEventType parameter E.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      // FLAT: both branches score the named "data" operand.
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      // MIMG: stores score operand 0; the atomic branch scores "data".
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      // MTBUF: only stores are scored, on operand 0.
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      // MUBUF: stores score operand 0; the atomic branch scores "data".
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            // Unlike the other paths, the slot here is the raw encoding value
            // of the register rather than a getRegInterval() result.
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      // Any remaining instruction kind: score every VGPR use operand.
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
    ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      // SGPR slots start at NUM_ALL_VGPRS; they carry no VM_CNT score.
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    // DS stores additionally score the reserved extra LDS slot.
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}
670 
671 void BlockWaitcntBrackets::print(raw_ostream &OS) {
672   OS << '\n';
673   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
674        T = (enum InstCounterType)(T + 1)) {
675     int LB = getScoreLB(T);
676     int UB = getScoreUB(T);
677 
678     switch (T) {
679     case VM_CNT:
680       OS << "    VM_CNT(" << UB - LB << "): ";
681       break;
682     case LGKM_CNT:
683       OS << "    LGKM_CNT(" << UB - LB << "): ";
684       break;
685     case EXP_CNT:
686       OS << "    EXP_CNT(" << UB - LB << "): ";
687       break;
688     default:
689       OS << "    UNKNOWN(" << UB - LB << "): ";
690       break;
691     }
692 
693     if (LB < UB) {
694       // Print vgpr scores.
695       for (int J = 0; J <= getMaxVGPR(); J++) {
696         int RegScore = getRegScore(J, T);
697         if (RegScore <= LB)
698           continue;
699         int RelScore = RegScore - LB - 1;
700         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
701           OS << RelScore << ":v" << J << " ";
702         } else {
703           OS << RelScore << ":ds ";
704         }
705       }
706       // Also need to print sgpr scores for lgkm_cnt.
707       if (T == LGKM_CNT) {
708         for (int J = 0; J <= getMaxSGPR(); J++) {
709           int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
710           if (RegScore <= LB)
711             continue;
712           int RelScore = RegScore - LB - 1;
713           OS << RelScore << ":s" << J << " ";
714         }
715       }
716     }
717     OS << '\n';
718   }
719   OS << '\n';
720 }
721 
722 unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
723                                                 int ScoreToWait) {
724   unsigned int NeedWait = 0;
725   if (ScoreToWait == -1) {
726     // The score to wait is unknown. This implies that it was not encountered
727     // during the path of the CFG walk done during the current traversal but
728     // may be seen on a different path. Emit an s_wait counter with a
729     // conservative value of 0 for the counter.
730     NeedWait = CNT_MASK(T);
731     setScoreLB(T, getScoreUB(T));
732     return NeedWait;
733   }
734 
735   // If the score of src_operand falls within the bracket, we need an
736   // s_waitcnt instruction.
737   const int32_t LB = getScoreLB(T);
738   const int32_t UB = getScoreUB(T);
739   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
740     if (T == VM_CNT && hasPendingFlat()) {
741       // If there is a pending FLAT operation, and this is a VM waitcnt,
742       // then we need to force a waitcnt 0 for VM.
743       NeedWait = CNT_MASK(T);
744       setScoreLB(T, getScoreUB(T));
745     } else if (counterOutOfOrder(T)) {
746       // Counter can get decremented out-of-order when there
747       // are multiple types event in the bracket. Also emit an s_wait counter
748       // with a conservative value of 0 for the counter.
749       NeedWait = CNT_MASK(T);
750       setScoreLB(T, getScoreUB(T));
751     } else {
752       NeedWait = CNT_MASK(T);
753       setScoreLB(T, ScoreToWait);
754     }
755   }
756 
757   return NeedWait;
758 }
759 
760 // Where there are multiple types of event in the bracket of a counter,
761 // the decrement may go out of order.
762 bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
763   switch (T) {
764   case VM_CNT:
765     return false;
766   case LGKM_CNT: {
767     if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
768         EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
769       // Scalar memory read always can go out of order.
770       return true;
771     }
772     int NumEventTypes = 0;
773     if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
774         EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
775       NumEventTypes++;
776     }
777     if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
778         EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
779       NumEventTypes++;
780     }
781     if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
782         EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
783       NumEventTypes++;
784     }
785     if (NumEventTypes <= 1) {
786       return false;
787     }
788     break;
789   }
790   case EXP_CNT: {
791     // If there has been a mixture of export types, then a waitcnt exp(0) is
792     // required.
793     if (MixedExpTypes)
794       return true;
795     int NumEventTypes = 0;
796     if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
797         EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
798       NumEventTypes++;
799     }
800     if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
801         EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
802       NumEventTypes++;
803     }
804     if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
805         EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
806       NumEventTypes++;
807     }
808     if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
809         EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
810       NumEventTypes++;
811     }
812 
813     if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
814         EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
815       NumEventTypes++;
816     }
817 
818     if (NumEventTypes <= 1) {
819       return false;
820     }
821     break;
822   }
823   default:
824     break;
825   }
826   return true;
827 }
828 
// Pass registration under the name DEBUG_TYPE ("si-insert-waitcnts").
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

// Storage for the pass identifier.
char SIInsertWaitcnts::ID = 0;

// Exported reference to the pass identifier.
char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

// Factory: returns a newly allocated SIInsertWaitcnts pass.
FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}
841 
842 static bool readsVCCZ(const MachineInstr &MI) {
843   unsigned Opc = MI.getOpcode();
844   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
845          !MI.getOperand(1).isUndef();
846 }
847 
848 /// Given wait count encodings checks if LHS is stronger than RHS.
849 bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
850   if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
851     return false;
852   if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
853     return false;
854   if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
855     return false;
856   return true;
857 }
858 
859 /// Given wait count encodings create a new encoding which is stronger
860 /// or equal to both.
861 unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
862   unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
863                             AMDGPU::decodeVmcnt(IV, RHS));
864   unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
865                               AMDGPU::decodeLgkmcnt(IV, RHS));
866   unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
867                              AMDGPU::decodeExpcnt(IV, RHS));
868   return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
869 }
870 
///  Decide whether an s_waitcnt is required immediately before \p MI and, if
///  so, insert a new one or update an existing one.
///
///  Instructions of a given type are returned in order,
///  but instructions of different types can complete out of order.
///  We rely on this in-order completion
///  and simply assign a score to the memory access instructions.
///  We keep track of the active "score bracket" to determine
///  if an access of a memory read requires an s_waitcnt
///  and if so what the value of each counter is.
///  The "score bracket" is bound by the lower bound and upper bound
///  scores (ScoreLB and ScoreUB respectively).
///
///  \param MI the instruction before which a wait may be needed.
///  \param ScoreBrackets pending-event state of the current block. It is
///         modified here (via setScoreLB / updateByWait / setMixedExpTypes /
///         clearWaitAtBeginning) to reflect the wait that is chosen.
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
  // To emit, or not to emit - that's the question!
  // Start with an assumption that there is no need to emit.
  // EmitWaitcnt is a bitmask: bit CNT_MASK(T) is set for every counter T that
  // needs a wait.
  unsigned int EmitWaitcnt = 0;

  // No need to wait before phi. If a phi-move exists, then the wait should
  // have been inserted before the move. If a phi-move does not exist, then
  // wait should be inserted before the real use. The same is true for
  // sc-merge. It is not a coincidence that all these cases correspond to the
  // instructions that are skipped in the assembling loop.
  bool NeedLineMapping = false; // TODO: Check on this.

  // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
  bool ForceEmitZeroWaitcnt = false;

  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  // Debug instructions never need a wait (NeedLineMapping is always false
  // here).
  if (MI.isDebugInstr() &&
      // TODO: any other opcode?
      !NeedLineMapping) {
    return;
  }

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      EmitWaitcnt |= CNT_MASK(T);
      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    }
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
  }

  // NOTE: the 'if' below starts a new, independent chain; it is evaluated
  // regardless of which branch (if any) of the chain above was taken.
  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //   with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::RETURN ||
      MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
        EmitWaitcnt |= CNT_MASK(T);
      }
    }
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      EmitWaitcnt |= CNT_MASK(VM_CNT);
    }
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
            context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
              ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
              ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
        ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    // Memory operands in the local address space are checked against the
    // VM_CNT score stored in the extra slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUASI.LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    }

    // Use operands: getRegInterval is queried with its last argument false.
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If a destination operand that was used by a recent export/store ins,
    // add s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUASI.LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
      }
    }
    // Def operands: getRegInterval is queried with its last argument true.
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingSMEM()) {
      // Wait on everything, not just LGKM.  vccz reads usually come from
      // terminators, and we always wait on everything at the end of the
      // block, so if we only wait on LGKM here, we might end up with
      // another s_waitcnt inserted right after this if there are non-LGKM
      // instructions still outstanding.
      // FIXME: this is too conservative / the comment is wrong.
      // We don't wait on everything at the end of the block and we combine
      // waitcnts so we should never have back-to-back waitcnts.
      ForceEmitZeroWaitcnt = true;
      // NOTE(review): this stores a bool into the counter bitmask, replacing
      // any bits accumulated above. Only its non-zero-ness is used afterwards,
      // because ForceEmitZeroWaitcnt bypasses the per-counter strategy below.
      EmitWaitcnt = true;
    }
  }

  // Does this operand processing indicate s_wait counter update?
  if (EmitWaitcnt || IsForceEmitWaitcnt) {
    // Per-counter wait value to encode; -1 stands for "don't care".
    int CntVal[NUM_INST_CNTS];

    bool UseDefaultWaitcntStrategy = true;
    if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
      // Force all waitcnts to 0.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
      }
      CntVal[VM_CNT] = 0;
      CntVal[EXP_CNT] = 0;
      CntVal[LGKM_CNT] = 0;
      UseDefaultWaitcntStrategy = false;
    }

    if (UseDefaultWaitcntStrategy) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (EmitWaitcnt & CNT_MASK(T)) {
          // Number of events of this counter type that may stay outstanding.
          int Delta =
              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
          if (Delta >= MaxDelta) {
            // The pending span is not smaller than getWaitCountMax(T): encode
            // "don't care" and stop requesting a wait on this counter. For
            // counters other than EXP_CNT the lower bound is moved to
            // UB - MaxDelta.
            Delta = -1;
            if (T != EXP_CNT) {
              ScoreBrackets->setScoreLB(
                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
            }
            EmitWaitcnt &= ~CNT_MASK(T);
          }
          CntVal[T] = Delta;
        } else {
          // If we are not waiting for a particular counter then encode
          // it as -1 which means "don't care."
          CntVal[T] = -1;
        }
      }
    }

    // If we are not waiting on any counter we can skip the wait altogether.
    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
      // If there is no previously recorded waitcnt, or any of its fields
      // differs from the newly computed value, and MI sits inside a loop,
      // flag the loop header's bracket so the loop gets revisited.
      if (!OldWaitcnt ||
          (AMDGPU::decodeVmcnt(IV, Imm) !=
                          (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
          (AMDGPU::decodeExpcnt(IV, Imm) !=
           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
        if (ContainingLoop) {
          MachineBasicBlock *TBB = ContainingLoop->getHeader();
          BlockWaitcntBrackets *ScoreBracket =
              BlockWaitcntBracketsMap[TBB].get();
          if (!ScoreBracket) {
            // The header has no bracket yet; create one so the revisit flag
            // has somewhere to live.
            assert(!BlockVisitedSet.count(TBB));
            BlockWaitcntBracketsMap[TBB] =
                llvm::make_unique<BlockWaitcntBrackets>();
            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
          }
          ScoreBracket->setRevisitLoop(true);
          DEBUG(dbgs() << "set-revisit: Block"
                       << ContainingLoop->getHeader()->getNumber() << '\n';);
        }
      }

      // Update an existing waitcount, or make a new one.
      // A counter whose ForceEmitWaitcnt entry is set is encoded as 0
      // regardless of the value computed above.
      unsigned Enc = AMDGPU::encodeWaitcnt(IV,
                      ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
                      ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
                      ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
      // We don't remove waitcnts that existed prior to the waitcnt
      // pass. Check if the waitcnt to-be-inserted can be avoided
      // or if the prev waitcnt can be updated.
      // NOTE(review): because of the unconditional 'break' after the switch,
      // at most one instruction - the one immediately preceding MI - is
      // examined, and since the loop stops at begin() the first instruction of
      // the block is never examined.
      bool insertSWaitInst = true;
      for (MachineBasicBlock::iterator I = MI.getIterator(),
                                       B = MI.getParent()->begin();
           insertSWaitInst && I != B; --I) {
        if (I == MI.getIterator())
          continue;

        switch (I->getOpcode()) {
        case AMDGPU::S_WAITCNT:
          if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
            insertSWaitInst = false;
          else if (!OldWaitcnt) {
            OldWaitcnt = &*I;
            Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
          }
          break;
        // TODO: skip over instructions which never require wait.
        }
        break;
      }
      if (insertSWaitInst) {
        if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
          if (ForceEmitZeroWaitcnts)
            DEBUG(dbgs() << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
          if (IsForceEmitWaitcnt)
            DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n");

          // Reuse the existing S_WAITCNT: rewrite its immediate and, if it is
          // not currently in any block, place it right before MI.
          OldWaitcnt->getOperand(0).setImm(Enc);
          if (!OldWaitcnt->getParent())
            MI.getParent()->insert(MI, OldWaitcnt);

          DEBUG(dbgs() << "updateWaitcntInBlock\n"
                       << "Old Instr: " << MI << '\n'
                       << "New Instr: " << *OldWaitcnt << '\n');
        } else {
            // Build a fresh S_WAITCNT before MI and remember it in
            // TrackedWaitcntSet.
            auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                               MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                             .addImm(Enc);
            TrackedWaitcntSet.insert(SWaitInst);

            DEBUG(dbgs() << "insertWaitcntInBlock\n"
                         << "Old Instr: " << MI << '\n'
                         << "New Instr: " << *SWaitInst << '\n');
        }
      }

      // A full wait on the export counter clears the mixed-export-types flag.
      if (CntVal[EXP_CNT] == 0) {
        ScoreBrackets->setMixedExpTypes(false);
      }
    }
  }
}
1274 
1275 void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
1276                                              MachineInstr *Waitcnt) {
1277   if (MBB.empty()) {
1278     MBB.push_back(Waitcnt);
1279     return;
1280   }
1281 
1282   MachineBasicBlock::iterator It = MBB.end();
1283   MachineInstr *MI = &*(--It);
1284   if (MI->isBranch()) {
1285     MBB.insert(It, Waitcnt);
1286   } else {
1287     MBB.push_back(Waitcnt);
1288   }
1289 }
1290 
1291 // This is a flat memory operation. Check to see if it has memory
1292 // tokens for both LDS and Memory, and if so mark it as a flat.
1293 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1294   if (MI.memoperands_empty())
1295     return true;
1296 
1297   for (const MachineMemOperand *Memop : MI.memoperands()) {
1298     unsigned AS = Memop->getAddrSpace();
1299     if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
1300       return true;
1301   }
1302 
1303   return false;
1304 }
1305 
1306 void SIInsertWaitcnts::updateEventWaitcntAfter(
1307     MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
1308   // Now look at the instruction opcode. If it is a memory access
1309   // instruction, update the upper-bound of the appropriate counter's
1310   // bracket and the destination operand scores.
1311   // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1312   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1313     if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1314       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1315       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1316     } else {
1317       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1318     }
1319   } else if (TII->isFLAT(Inst)) {
1320     assert(Inst.mayLoad() || Inst.mayStore());
1321 
1322     if (TII->usesVM_CNT(Inst))
1323       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1324 
1325     if (TII->usesLGKM_CNT(Inst)) {
1326       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1327 
1328       // This is a flat memory operation, so note it - it will require
1329       // that both the VM and LGKM be flushed to zero if it is pending when
1330       // a VM or LGKM dependency occurs.
1331       if (mayAccessLDSThroughFlat(Inst))
1332         ScoreBrackets->setPendingFlat();
1333     }
1334   } else if (SIInstrInfo::isVMEM(Inst) &&
1335              // TODO: get a better carve out.
1336              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1337              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1338              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1339     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1340     if (ST->vmemWriteNeedsExpWaitcnt() &&
1341         (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1342       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1343     }
1344   } else if (TII->isSMRD(Inst)) {
1345     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1346   } else {
1347     switch (Inst.getOpcode()) {
1348     case AMDGPU::S_SENDMSG:
1349     case AMDGPU::S_SENDMSGHALT:
1350       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1351       break;
1352     case AMDGPU::EXP:
1353     case AMDGPU::EXP_DONE: {
1354       int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1355       if (Imm >= 32 && Imm <= 63)
1356         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1357       else if (Imm >= 12 && Imm <= 15)
1358         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1359       else
1360         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1361       break;
1362     }
1363     case AMDGPU::S_MEMTIME:
1364     case AMDGPU::S_MEMREALTIME:
1365       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1366       break;
1367     default:
1368       break;
1369     }
1370   }
1371 }
1372 
1373 // Merge the score brackets of the Block's predecessors;
1374 // this merged score bracket is used when adding waitcnts to the Block
1375 void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1376   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1377   int32_t MaxPending[NUM_INST_CNTS] = {0};
1378   int32_t MaxFlat[NUM_INST_CNTS] = {0};
1379   bool MixedExpTypes = false;
1380 
1381   // For single basic block loops, we need to retain the Block's
1382   // score bracket to have accurate Pred info. So, make a copy of Block's
1383   // score bracket, clear() it (which retains several important bits of info),
1384   // populate, and then replace en masse. For non-single basic block loops,
1385   // just clear Block's current score bracket and repopulate in-place.
1386   bool IsSelfPred;
1387   std::unique_ptr<BlockWaitcntBrackets> S;
1388 
1389   IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
1390     != Block.pred_end();
1391   if (IsSelfPred) {
1392     S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1393     ScoreBrackets = S.get();
1394   }
1395 
1396   ScoreBrackets->clear();
1397 
1398   // See if there are any uninitialized predecessors. If so, emit an
1399   // s_waitcnt 0 at the beginning of the block.
1400   for (MachineBasicBlock *Pred : Block.predecessors()) {
1401     BlockWaitcntBrackets *PredScoreBrackets =
1402         BlockWaitcntBracketsMap[Pred].get();
1403     bool Visited = BlockVisitedSet.count(Pred);
1404     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1405       continue;
1406     }
1407     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1408          T = (enum InstCounterType)(T + 1)) {
1409       int span =
1410           PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1411       MaxPending[T] = std::max(MaxPending[T], span);
1412       span =
1413           PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1414       MaxFlat[T] = std::max(MaxFlat[T], span);
1415     }
1416 
1417     MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
1418   }
1419 
1420   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1421   // Also handle kills for exit block.
1422   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1423     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1424       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1425            T = (enum InstCounterType)(T + 1)) {
1426         int Span = KillWaitBrackets[I]->getScoreUB(T) -
1427                    KillWaitBrackets[I]->getScoreLB(T);
1428         MaxPending[T] = std::max(MaxPending[T], Span);
1429         Span = KillWaitBrackets[I]->pendingFlat(T) -
1430                KillWaitBrackets[I]->getScoreLB(T);
1431         MaxFlat[T] = std::max(MaxFlat[T], Span);
1432       }
1433 
1434       MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
1435     }
1436   }
1437 
1438   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
1439   for (MachineBasicBlock *Pred : Block.predecessors()) {
1440     BlockWaitcntBrackets *PredScoreBrackets =
1441         BlockWaitcntBracketsMap[Pred].get();
1442     bool Visited = BlockVisitedSet.count(Pred);
1443     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1444       continue;
1445     }
1446 
1447     int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
1448                   PredScoreBrackets->getScoreLB(EXP_CNT);
1449     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1450     int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
1451                   PredScoreBrackets->getScoreLB(EXP_CNT);
1452     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1453   }
1454 
1455   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1456   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1457     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1458       int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
1459                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1460       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1461       int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
1462                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1463       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1464     }
1465   }
1466 
1467 #if 0
1468   // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
1469   // TODO: how does LC distinguish between function entry and main entry?
1470   // If this is the entry to a function, force a wait.
1471   MachineBasicBlock &Entry = Block.getParent()->front();
1472   if (Entry.getNumber() == Block.getNumber()) {
1473     ScoreBrackets->setWaitAtBeginning();
1474     return;
1475   }
1476 #endif
1477 
1478   // Now set the current Block's brackets to the largest ending bracket.
1479   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1480        T = (enum InstCounterType)(T + 1)) {
1481     ScoreBrackets->setScoreUB(T, MaxPending[T]);
1482     ScoreBrackets->setScoreLB(T, 0);
1483     ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1484   }
1485 
1486   ScoreBrackets->setMixedExpTypes(MixedExpTypes);
1487 
1488   // Set the register scoreboard.
1489   for (MachineBasicBlock *Pred : Block.predecessors()) {
1490     if (!BlockVisitedSet.count(Pred)) {
1491       continue;
1492     }
1493 
1494     BlockWaitcntBrackets *PredScoreBrackets =
1495         BlockWaitcntBracketsMap[Pred].get();
1496 
1497     // Now merge the gpr_reg_score information
1498     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1499          T = (enum InstCounterType)(T + 1)) {
1500       int PredLB = PredScoreBrackets->getScoreLB(T);
1501       int PredUB = PredScoreBrackets->getScoreUB(T);
1502       if (PredLB < PredUB) {
1503         int PredScale = MaxPending[T] - PredUB;
1504         // Merge vgpr scores.
1505         for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1506           int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1507           if (PredRegScore <= PredLB)
1508             continue;
1509           int NewRegScore = PredScale + PredRegScore;
1510           ScoreBrackets->setRegScore(
1511               J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1512         }
1513         // Also need to merge sgpr scores for lgkm_cnt.
1514         if (T == LGKM_CNT) {
1515           for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1516             int PredRegScore =
1517                 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1518             if (PredRegScore <= PredLB)
1519               continue;
1520             int NewRegScore = PredScale + PredRegScore;
1521             ScoreBrackets->setRegScore(
1522                 J + NUM_ALL_VGPRS, LGKM_CNT,
1523                 std::max(
1524                     ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1525                     NewRegScore));
1526           }
1527         }
1528       }
1529     }
1530 
1531     // Also merge the WaitEvent information.
1532     ForAllWaitEventType(W) {
1533       enum InstCounterType T = PredScoreBrackets->eventCounter(W);
1534       int PredEventUB = PredScoreBrackets->getEventUB(W);
1535       if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
1536         int NewEventUB =
1537             MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
1538         if (NewEventUB > 0) {
1539           ScoreBrackets->setEventUB(
1540               W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1541         }
1542       }
1543     }
1544   }
1545 
1546   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1547   // Set the register scoreboard.
1548   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1549     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1550       // Now merge the gpr_reg_score information.
1551       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1552            T = (enum InstCounterType)(T + 1)) {
1553         int PredLB = KillWaitBrackets[I]->getScoreLB(T);
1554         int PredUB = KillWaitBrackets[I]->getScoreUB(T);
1555         if (PredLB < PredUB) {
1556           int PredScale = MaxPending[T] - PredUB;
1557           // Merge vgpr scores.
1558           for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
1559             int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
1560             if (PredRegScore <= PredLB)
1561               continue;
1562             int NewRegScore = PredScale + PredRegScore;
1563             ScoreBrackets->setRegScore(
1564                 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1565           }
1566           // Also need to merge sgpr scores for lgkm_cnt.
1567           if (T == LGKM_CNT) {
1568             for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
1569               int PredRegScore =
1570                   KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1571               if (PredRegScore <= PredLB)
1572                 continue;
1573               int NewRegScore = PredScale + PredRegScore;
1574               ScoreBrackets->setRegScore(
1575                   J + NUM_ALL_VGPRS, LGKM_CNT,
1576                   std::max(
1577                       ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1578                       NewRegScore));
1579             }
1580           }
1581         }
1582       }
1583 
1584       // Also merge the WaitEvent information.
1585       ForAllWaitEventType(W) {
1586         enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
1587         int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
1588         if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
1589           int NewEventUB =
1590               MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
1591           if (NewEventUB > 0) {
1592             ScoreBrackets->setEventUB(
1593                 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1594           }
1595         }
1596       }
1597     }
1598   }
1599 
1600   // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
1601   // sequencing predecessors, because changes to EXEC require waitcnts due to
1602   // the delayed nature of these operations.
1603   for (MachineBasicBlock *Pred : Block.predecessors()) {
1604     if (!BlockVisitedSet.count(Pred)) {
1605       continue;
1606     }
1607 
1608     BlockWaitcntBrackets *PredScoreBrackets =
1609         BlockWaitcntBracketsMap[Pred].get();
1610 
1611     int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
1612     if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1613       int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
1614                        PredScoreBrackets->getScoreUB(EXP_CNT);
1615       if (new_gds_ub > 0) {
1616         ScoreBrackets->setEventUB(
1617             GDS_GPR_LOCK,
1618             std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
1619       }
1620     }
1621     int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
1622     if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1623       int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
1624                        PredScoreBrackets->getScoreUB(EXP_CNT);
1625       if (new_exp_ub > 0) {
1626         ScoreBrackets->setEventUB(
1627             EXP_GPR_LOCK,
1628             std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
1629       }
1630     }
1631   }
1632 
1633   // if a single block loop, update the score brackets. Not needed for other
1634   // blocks, as we did this in-place
1635   if (IsSelfPred) {
1636     BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1637   }
1638 }
1639 
1640 /// Return true if the given basic block is a "bottom" block of a loop. This
1641 /// differs from MachineLoop::getBottomBlock in that it works even if the loop
1642 /// is discontiguous. This also handles multiple back-edges for the same
1643 /// "header" block of a loop.
1644 bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1645                                     const MachineBasicBlock *Block) {
1646   for (MachineBasicBlock *MBB : Loop->blocks()) {
1647     if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1648       return true;
1649     }
1650   }
1651   return false;
1652 }
1653 
1654 /// Count the number of "bottom" basic blocks of a loop.
1655 unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1656   unsigned Count = 0;
1657   for (MachineBasicBlock *MBB : Loop->blocks()) {
1658     if (MBB->isSuccessor(Loop->getHeader())) {
1659       Count++;
1660     }
1661   }
1662   return Count;
1663 }
1664 
1665 // Generate s_waitcnt instructions where needed.
1666 void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1667                                             MachineBasicBlock &Block) {
1668   // Initialize the state information.
1669   mergeInputScoreBrackets(Block);
1670 
1671   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1672 
1673   DEBUG({
1674     dbgs() << "*** Block" << Block.getNumber() << " ***";
1675     ScoreBrackets->dump();
1676   });
1677 
1678   // Walk over the instructions.
1679   for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1680        Iter != E;) {
1681     MachineInstr &Inst = *Iter;
1682     // Remove any previously existing waitcnts.
1683     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
1684       // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
1685       // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
1686       // as needed.
1687       if (!TrackedWaitcntSet.count(&Inst))
1688         ++Iter;
1689       else {
1690         ++Iter;
1691         Inst.removeFromParent();
1692       }
1693       ScoreBrackets->setWaitcnt(&Inst);
1694       continue;
1695     }
1696 
1697     // Kill instructions generate a conditional branch to the endmain block.
1698     // Merge the current waitcnt state into the endmain block information.
1699     // TODO: Are there other flavors of KILL instruction?
1700     if (Inst.getOpcode() == AMDGPU::KILL) {
1701       addKillWaitBracket(ScoreBrackets);
1702     }
1703 
1704     bool VCCZBugWorkAround = false;
1705     if (readsVCCZ(Inst) &&
1706         (!VCCZBugHandledSet.count(&Inst))) {
1707       if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1708               ScoreBrackets->getScoreUB(LGKM_CNT) &&
1709           ScoreBrackets->hasPendingSMEM()) {
1710         if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
1711           VCCZBugWorkAround = true;
1712       }
1713     }
1714 
1715     // Generate an s_waitcnt instruction to be placed before
1716     // cur_Inst, if needed.
1717     generateWaitcntInstBefore(Inst, ScoreBrackets);
1718 
1719     updateEventWaitcntAfter(Inst, ScoreBrackets);
1720 
1721 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1722     // If this instruction generates a S_SETVSKIP because it is an
1723     // indexed resource, and we are on Tahiti, then it will also force
1724     // an S_WAITCNT vmcnt(0)
1725     if (RequireCheckResourceType(Inst, context)) {
1726       // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1727       ScoreBrackets->setScoreLB(VM_CNT,
1728       ScoreBrackets->getScoreUB(VM_CNT));
1729     }
1730 #endif
1731 
1732     ScoreBrackets->clearWaitcnt();
1733 
1734     DEBUG({
1735       Inst.print(dbgs());
1736       ScoreBrackets->dump();
1737     });
1738 
1739     // Check to see if this is a GWS instruction. If so, and if this is CI or
1740     // VI, then the generated code sequence will include an S_WAITCNT 0.
1741     // TODO: Are these the only GWS instructions?
1742     if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1743         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1744         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1745         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1746         Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1747       // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1748       ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1749       ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1750       ScoreBrackets->updateByWait(LGKM_CNT,
1751                                   ScoreBrackets->getScoreUB(LGKM_CNT));
1752     }
1753 
1754     // TODO: Remove this work-around after fixing the scheduler and enable the
1755     // assert above.
1756     if (VCCZBugWorkAround) {
1757       // Restore the vccz bit.  Any time a value is written to vcc, the vcc
1758       // bit is updated, so we can restore the bit by reading the value of
1759       // vcc and then writing it back to the register.
1760       BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1761               AMDGPU::VCC)
1762           .addReg(AMDGPU::VCC);
1763       VCCZBugHandledSet.insert(&Inst);
1764     }
1765 
1766     ++Iter;
1767   }
1768 
1769   // Check if we need to force convergence at loop footer.
1770   MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
1771   if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
1772     LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1773     WaitcntData->print();
1774     DEBUG(dbgs() << '\n';);
1775 
1776     // The iterative waitcnt insertion algorithm aims for optimal waitcnt
1777     // placement and doesn't always guarantee convergence for a loop. Each
1778     // loop should take at most 2 iterations for it to converge naturally.
1779     // When this max is reached and result doesn't converge, we force
1780     // convergence by inserting a s_waitcnt at the end of loop footer.
1781     if (WaitcntData->getIterCnt() > 2) {
1782       // To ensure convergence, need to make wait events at loop footer be no
1783       // more than those from the previous iteration.
1784       // As a simplification, instead of tracking individual scores and
1785       // generating the precise wait count, just wait on 0.
1786       bool HasPending = false;
1787       MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
1788       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1789            T = (enum InstCounterType)(T + 1)) {
1790         if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1791           ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1792           HasPending = true;
1793         }
1794       }
1795 
1796       if (HasPending) {
1797         if (!SWaitInst) {
1798           SWaitInst = Block.getParent()->CreateMachineInstr(
1799               TII->get(AMDGPU::S_WAITCNT), DebugLoc());
1800           TrackedWaitcntSet.insert(SWaitInst);
1801           const MachineOperand &Op = MachineOperand::CreateImm(0);
1802           SWaitInst->addOperand(MF, Op);
1803 #if 0 // TODO: Format the debug output
1804           OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1805           OutputTransformAdd(SWaitInst, context);
1806 #endif
1807         }
1808 #if 0 // TODO: ??
1809         _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1810 #endif
1811       }
1812 
1813       if (SWaitInst) {
1814         DEBUG({
1815           SWaitInst->print(dbgs());
1816           dbgs() << "\nAdjusted score board:";
1817           ScoreBrackets->dump();
1818         });
1819 
1820         // Add this waitcnt to the block. It is either newly created or
1821         // created in previous iterations and added back since block traversal
1822         // always removes waitcnts.
1823         insertWaitcntBeforeCF(Block, SWaitInst);
1824         WaitcntData->setWaitcnt(SWaitInst);
1825       }
1826     }
1827   }
1828 }
1829 
1830 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1831   ST = &MF.getSubtarget<SISubtarget>();
1832   TII = ST->getInstrInfo();
1833   TRI = &TII->getRegisterInfo();
1834   MRI = &MF.getRegInfo();
1835   MLI = &getAnalysis<MachineLoopInfo>();
1836   IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
1837   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1838   AMDGPUASI = ST->getAMDGPUAS();
1839 
1840   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1841   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1842        T = (enum InstCounterType)(T + 1))
1843     ForceEmitWaitcnt[T] = false;
1844 
1845   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1846   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1847   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1848 
1849   HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1850   HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1851   assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1852   assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1853 
1854   RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1855   RegisterEncoding.VGPRL =
1856       RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1857   RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1858   RegisterEncoding.SGPRL =
1859       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1860 
1861   TrackedWaitcntSet.clear();
1862   BlockVisitedSet.clear();
1863   VCCZBugHandledSet.clear();
1864   LoopWaitcntDataMap.clear();
1865 
1866   // Walk over the blocks in reverse post-dominator order, inserting
1867   // s_waitcnt where needed.
1868   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1869   bool Modified = false;
1870   for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1871            I = RPOT.begin(),
1872            E = RPOT.end(), J = RPOT.begin();
1873        I != E;) {
1874     MachineBasicBlock &MBB = **I;
1875 
1876     BlockVisitedSet.insert(&MBB);
1877 
1878     BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1879     if (!ScoreBrackets) {
1880       BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
1881       ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1882     }
1883     ScoreBrackets->setPostOrder(MBB.getNumber());
1884     MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1885     if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
1886       LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
1887 
1888     // If we are walking into the block from before the loop, then guarantee
1889     // at least 1 re-walk over the loop to propagate the information, even if
1890     // no S_WAITCNT instructions were generated.
1891     if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1892       unsigned Count = countNumBottomBlocks(ContainingLoop);
1893 
1894       // If the loop has multiple back-edges, and so more than one "bottom"
1895       // basic block, we have to guarantee a re-walk over every blocks.
1896       if ((std::count(BlockWaitcntProcessedSet.begin(),
1897                       BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
1898         BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
1899         DEBUG(dbgs() << "set-revisit: Block"
1900                      << ContainingLoop->getHeader()->getNumber() << '\n';);
1901       }
1902     }
1903 
1904     // Walk over the instructions.
1905     insertWaitcntInBlock(MF, MBB);
1906 
1907     // Flag that waitcnts have been processed at least once.
1908     BlockWaitcntProcessedSet.push_back(&MBB);
1909 
1910     // See if we want to revisit the loop. If a loop has multiple back-edges,
1911     // we shouldn't revisit the same "bottom" basic block.
1912     if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1913         std::count(BlockWaitcntProcessedSet.begin(),
1914                    BlockWaitcntProcessedSet.end(), &MBB) == 1) {
1915       MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
1916       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1917       if (EntrySB && EntrySB->getRevisitLoop()) {
1918         EntrySB->setRevisitLoop(false);
1919         J = I;
1920         int32_t PostOrder = EntrySB->getPostOrder();
1921         // TODO: Avoid this loop. Find another way to set I.
1922         for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1923                  X = RPOT.begin(),
1924                  Y = RPOT.end();
1925              X != Y; ++X) {
1926           MachineBasicBlock &MBBX = **X;
1927           if (MBBX.getNumber() == PostOrder) {
1928             I = X;
1929             break;
1930           }
1931         }
1932         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1933         WaitcntData->incIterCnt();
1934         DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
1935         continue;
1936       } else {
1937         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1938         // Loop converged, reset iteration count. If this loop gets revisited,
1939         // it must be from an outer loop, the counter will restart, this will
1940         // ensure we don't force convergence on such revisits.
1941         WaitcntData->resetIterCnt();
1942       }
1943     }
1944 
1945     J = I;
1946     ++I;
1947   }
1948 
1949   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1950 
1951   bool HaveScalarStores = false;
1952 
1953   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1954        ++BI) {
1955     MachineBasicBlock &MBB = *BI;
1956 
1957     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1958          ++I) {
1959       if (!HaveScalarStores && TII->isScalarStore(*I))
1960         HaveScalarStores = true;
1961 
1962       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1963           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1964         EndPgmBlocks.push_back(&MBB);
1965     }
1966   }
1967 
1968   if (HaveScalarStores) {
1969     // If scalar writes are used, the cache must be flushed or else the next
1970     // wave to reuse the same scratch memory can be clobbered.
1971     //
1972     // Insert s_dcache_wb at wave termination points if there were any scalar
1973     // stores, and only if the cache hasn't already been flushed. This could be
1974     // improved by looking across blocks for flushes in postdominating blocks
1975     // from the stores but an explicitly requested flush is probably very rare.
1976     for (MachineBasicBlock *MBB : EndPgmBlocks) {
1977       bool SeenDCacheWB = false;
1978 
1979       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1980            ++I) {
1981         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1982           SeenDCacheWB = true;
1983         else if (TII->isScalarStore(*I))
1984           SeenDCacheWB = false;
1985 
1986         // FIXME: It would be better to insert this before a waitcnt if any.
1987         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1988              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1989             !SeenDCacheWB) {
1990           Modified = true;
1991           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1992         }
1993       }
1994     }
1995   }
1996 
1997   if (!MFI->isEntryFunction()) {
1998     // Wait for any outstanding memory operations that the input registers may
1999     // depend on. We can't track them and it's better to the wait after the
2000     // costly call sequence.
2001 
2002     // TODO: Could insert earlier and schedule more liberally with operations
2003     // that only use caller preserved registers.
2004     MachineBasicBlock &EntryBB = MF.front();
2005     BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
2006       .addImm(0);
2007 
2008     Modified = true;
2009   }
2010 
2011   return Modified;
2012 }
2013