1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Insert wait instructions for memory reads and writes.
12 ///
13 /// Memory reads and writes are issued asynchronously, so we need to insert
14 /// S_WAITCNT instructions when we want to access any of their results or
15 /// overwrite any register that's used asynchronously.
16 //
17 //===----------------------------------------------------------------------===//
18 
19 #include "AMDGPU.h"
20 #include "AMDGPUSubtarget.h"
21 #include "SIDefines.h"
22 #include "SIInstrInfo.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "SIRegisterInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/DenseMap.h"
27 #include "llvm/ADT/DenseSet.h"
28 #include "llvm/ADT/PostOrderIterator.h"
29 #include "llvm/ADT/STLExtras.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/CodeGen/MachineBasicBlock.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/MachineFunctionPass.h"
34 #include "llvm/CodeGen/MachineInstr.h"
35 #include "llvm/CodeGen/MachineInstrBuilder.h"
36 #include "llvm/CodeGen/MachineLoopInfo.h"
37 #include "llvm/CodeGen/MachineMemOperand.h"
38 #include "llvm/CodeGen/MachineOperand.h"
39 #include "llvm/CodeGen/MachineRegisterInfo.h"
40 #include "llvm/IR/DebugLoc.h"
41 #include "llvm/Pass.h"
42 #include "llvm/Support/Debug.h"
43 #include "llvm/Support/DebugCounter.h"
44 #include "llvm/Support/ErrorHandling.h"
45 #include "llvm/Support/raw_ostream.h"
46 #include <algorithm>
47 #include <cassert>
48 #include <cstdint>
49 #include <cstring>
50 #include <memory>
51 #include <utility>
52 #include <vector>
53 
54 using namespace llvm;
55 
56 #define DEBUG_TYPE "si-insert-waitcnts"
57 
58 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
59               "Force emit s_waitcnt expcnt(0) instrs");
60 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
61               "Force emit s_waitcnt lgkmcnt(0) instrs");
62 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
63               "Force emit s_waitcnt vmcnt(0) instrs");
64 
65 static cl::opt<unsigned> ForceEmitZeroFlag(
66   "amdgpu-waitcnt-forcezero",
67   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
68   cl::init(0), cl::Hidden);
69 
70 namespace {
71 
// Class of object that encapsulates latest instruction counter score
// associated with the operand.  Used for determining whether
// s_waitcnt instruction needs to be emitted.
75 
// Single-bit mask selecting counter type `t` (bit position == enumerator
// value).
#define CNT_MASK(t) (1u << (t))

// Wait counters tracked by this pass. The enumerators are used as array
// indices; NUM_INST_CNTS is the number of counters and must stay last.
enum InstCounterType {
  VM_CNT = 0,
  LGKM_CNT,
  EXP_CNT,
  NUM_INST_CNTS
};

// Half-open range [first, second) of scoreboard slots covered by a register
// operand. {-1, -1} is used when an operand is not tracked.
using RegInterval = std::pair<signed, signed>;
81 
// Per-target limits. The *cntMax fields are what
// BlockWaitcntBrackets::getWaitCountMax() returns for each counter type.
// NOTE(review): the code that fills these in is not in this part of the file.
struct {
  int32_t VmcntMax;    // Returned for VM_CNT.
  int32_t ExpcntMax;   // Returned for EXP_CNT.
  int32_t LgkmcntMax;  // Returned for LGKM_CNT.
  int32_t NumVGPRsMax; // Presumably the VGPR count limit - confirm at the
                       // point of initialization.
  int32_t NumSGPRsMax; // Presumably the SGPR count limit - confirm at the
                       // point of initialization.
} HardwareLimits;
89 
// Register encoding bounds. getRegInterval() subtracts VGPR0 / SGPR0 from an
// operand's encoding value to obtain its scoreboard slot, and asserts that a
// VGPR encoding lies in [VGPR0, VGPRL].
struct {
  unsigned VGPR0; // Encoding subtracted for VGPR operands.
  unsigned VGPRL; // Largest VGPR encoding accepted by the assertion.
  unsigned SGPR0; // Encoding subtracted for SGPR operands.
  unsigned SGPRL; // Presumably the last SGPR encoding - not read in this
                  // part of the file.
} RegisterEncoding;
96 
// Kinds of events that advance a wait counter. The enumerators index
// BlockWaitcntBrackets::EventUBs; NUM_WAIT_EVENTS must stay last.
// BlockWaitcntBrackets::eventCounter() maps each event to its counter.
enum WaitEventType {
  // Counted by VM_CNT.
  VMEM_ACCESS, // vector-memory read & write

  // Counted by LGKM_CNT.
  LDS_ACCESS,  // lds read & write
  GDS_ACCESS,  // gds read & write
  SQ_MESSAGE,  // send message
  SMEM_ACCESS, // scalar-memory read & write

  // Counted by EXP_CNT.
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src

  NUM_WAIT_EVENTS,
};
110 
// Layout of the scoreboard slot index space:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// A fixed number of VGPR-like slots is reserved in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_VGPRS = 256,
  // Maximum programmable SGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256,
  // A reserved slot for DS.
  NUM_EXTRA_VGPRS = 1,
  // This is a placeholder the Shader algorithm uses: the offset of the LDS
  // slot within the extra VGPR-like slots.
  EXTRA_VGPR_LDS = 0,
  // Where SGPR slots start.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS,
};
124 
// Loop header that declares `w` and steps it through every WaitEventType
// enumerator below NUM_WAIT_EVENTS, in increasing order.
#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = static_cast<enum WaitEventType>(0);              \
       (w) < static_cast<enum WaitEventType>(NUM_WAIT_EVENTS);                 \
       (w) = static_cast<enum WaitEventType>((w) + 1))
129 
130 // This is a per-basic-block object that maintains current score brackets
131 // of each wait counter, and a per-register scoreboard for each wait counter.
132 // We also maintain the latest score for every event type that can change the
133 // waitcnt in order to know if there are multiple types of events within
134 // the brackets. When multiple types of event happen in the bracket,
135 // wait count may get decreased out of order, therefore we need to put in
136 // "s_waitcnt 0" before use.
137 class BlockWaitcntBrackets {
138 public:
139   BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
140     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
141          T = (enum InstCounterType)(T + 1)) {
142       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
143     }
144   }
145 
146   ~BlockWaitcntBrackets() = default;
147 
148   static int32_t getWaitCountMax(InstCounterType T) {
149     switch (T) {
150     case VM_CNT:
151       return HardwareLimits.VmcntMax;
152     case LGKM_CNT:
153       return HardwareLimits.LgkmcntMax;
154     case EXP_CNT:
155       return HardwareLimits.ExpcntMax;
156     default:
157       break;
158     }
159     return 0;
160   }
161 
162   void setScoreLB(InstCounterType T, int32_t Val) {
163     assert(T < NUM_INST_CNTS);
164     if (T >= NUM_INST_CNTS)
165       return;
166     ScoreLBs[T] = Val;
167   }
168 
169   void setScoreUB(InstCounterType T, int32_t Val) {
170     assert(T < NUM_INST_CNTS);
171     if (T >= NUM_INST_CNTS)
172       return;
173     ScoreUBs[T] = Val;
174     if (T == EXP_CNT) {
175       int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
176       if (ScoreLBs[T] < UB)
177         ScoreLBs[T] = UB;
178     }
179   }
180 
181   int32_t getScoreLB(InstCounterType T) {
182     assert(T < NUM_INST_CNTS);
183     if (T >= NUM_INST_CNTS)
184       return 0;
185     return ScoreLBs[T];
186   }
187 
188   int32_t getScoreUB(InstCounterType T) {
189     assert(T < NUM_INST_CNTS);
190     if (T >= NUM_INST_CNTS)
191       return 0;
192     return ScoreUBs[T];
193   }
194 
195   // Mapping from event to counter.
196   InstCounterType eventCounter(WaitEventType E) {
197     switch (E) {
198     case VMEM_ACCESS:
199       return VM_CNT;
200     case LDS_ACCESS:
201     case GDS_ACCESS:
202     case SQ_MESSAGE:
203     case SMEM_ACCESS:
204       return LGKM_CNT;
205     case EXP_GPR_LOCK:
206     case GDS_GPR_LOCK:
207     case VMW_GPR_LOCK:
208     case EXP_POS_ACCESS:
209     case EXP_PARAM_ACCESS:
210       return EXP_CNT;
211     default:
212       llvm_unreachable("unhandled event type");
213     }
214     return NUM_INST_CNTS;
215   }
216 
217   void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
218     if (GprNo < NUM_ALL_VGPRS) {
219       if (GprNo > VgprUB) {
220         VgprUB = GprNo;
221       }
222       VgprScores[T][GprNo] = Val;
223     } else {
224       assert(T == LGKM_CNT);
225       if (GprNo - NUM_ALL_VGPRS > SgprUB) {
226         SgprUB = GprNo - NUM_ALL_VGPRS;
227       }
228       SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
229     }
230   }
231 
232   int32_t getRegScore(int GprNo, InstCounterType T) {
233     if (GprNo < NUM_ALL_VGPRS) {
234       return VgprScores[T][GprNo];
235     }
236     return SgprScores[GprNo - NUM_ALL_VGPRS];
237   }
238 
239   void clear() {
240     memset(ScoreLBs, 0, sizeof(ScoreLBs));
241     memset(ScoreUBs, 0, sizeof(ScoreUBs));
242     memset(EventUBs, 0, sizeof(EventUBs));
243     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
244          T = (enum InstCounterType)(T + 1)) {
245       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
246     }
247     memset(SgprScores, 0, sizeof(SgprScores));
248   }
249 
250   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
251                              const MachineRegisterInfo *MRI,
252                              const SIRegisterInfo *TRI, unsigned OpNo,
253                              bool Def) const;
254 
255   void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
256                    const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
257                    unsigned OpNo, int32_t Val);
258 
259   void setWaitAtBeginning() { WaitAtBeginning = true; }
260   void clearWaitAtBeginning() { WaitAtBeginning = false; }
261   bool getWaitAtBeginning() const { return WaitAtBeginning; }
262   void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
263   int32_t getMaxVGPR() const { return VgprUB; }
264   int32_t getMaxSGPR() const { return SgprUB; }
265 
266   int32_t getEventUB(enum WaitEventType W) const {
267     assert(W < NUM_WAIT_EVENTS);
268     return EventUBs[W];
269   }
270 
271   bool counterOutOfOrder(InstCounterType T);
272   unsigned int updateByWait(InstCounterType T, int ScoreToWait);
273   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
274                      const MachineRegisterInfo *MRI, WaitEventType E,
275                      MachineInstr &MI);
276 
277   bool hasPendingSMEM() const {
278     return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
279             EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
280   }
281 
282   bool hasPendingFlat() const {
283     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
284              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
285             (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
286              LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
287   }
288 
289   void setPendingFlat() {
290     LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
291     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
292   }
293 
294   int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
295 
296   void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
297 
298   bool getRevisitLoop() const { return RevisitLoop; }
299   void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
300 
301   void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
302   int32_t getPostOrder() const { return PostOrder; }
303 
304   void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
305   void clearWaitcnt() { Waitcnt = nullptr; }
306   MachineInstr *getWaitcnt() const { return Waitcnt; }
307 
308   bool mixedExpTypes() const { return MixedExpTypes; }
309   void setMixedExpTypes(bool MixedExpTypesIn) {
310     MixedExpTypes = MixedExpTypesIn;
311   }
312 
313   void print(raw_ostream &);
314   void dump() { print(dbgs()); }
315 
316 private:
317   const GCNSubtarget *ST = nullptr;
318   bool WaitAtBeginning = false;
319   bool RevisitLoop = false;
320   bool MixedExpTypes = false;
321   int32_t PostOrder = 0;
322   MachineInstr *Waitcnt = nullptr;
323   int32_t ScoreLBs[NUM_INST_CNTS] = {0};
324   int32_t ScoreUBs[NUM_INST_CNTS] = {0};
325   int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
326   // Remember the last flat memory operation.
327   int32_t LastFlat[NUM_INST_CNTS] = {0};
328   // wait_cnt scores for every vgpr.
329   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
330   int32_t VgprUB = 0;
331   int32_t SgprUB = 0;
332   int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
333   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
334   int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
335 };
336 
337 // This is a per-loop-region object that records waitcnt status at the end of
338 // loop footer from the previous iteration. We also maintain an iteration
339 // count to track the number of times the loop has been visited. When it
340 // doesn't converge naturally, we force convergence by inserting s_waitcnt 0
341 // at the end of the loop footer.
342 class LoopWaitcntData {
343 public:
344   LoopWaitcntData() = default;
345   ~LoopWaitcntData() = default;
346 
347   void incIterCnt() { IterCnt++; }
348   void resetIterCnt() { IterCnt = 0; }
349   unsigned getIterCnt() { return IterCnt; }
350 
351   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
352   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
353 
354   void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }
355 
356 private:
357   // s_waitcnt added at the end of loop footer to stablize wait scores
358   // at the end of the loop footer.
359   MachineInstr *LfWaitcnt = nullptr;
360   // Number of iterations the loop has been visited, not including the initial
361   // walk over.
362   int32_t IterCnt = 0;
363 };
364 
365 class SIInsertWaitcnts : public MachineFunctionPass {
366 private:
367   const GCNSubtarget *ST = nullptr;
368   const SIInstrInfo *TII = nullptr;
369   const SIRegisterInfo *TRI = nullptr;
370   const MachineRegisterInfo *MRI = nullptr;
371   const MachineLoopInfo *MLI = nullptr;
372   AMDGPU::IsaVersion IV;
373 
374   DenseSet<MachineBasicBlock *> BlockVisitedSet;
375   DenseSet<MachineInstr *> TrackedWaitcntSet;
376   DenseSet<MachineInstr *> VCCZBugHandledSet;
377 
378   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
379       BlockWaitcntBracketsMap;
380 
381   std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
382 
383   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
384 
385   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
386   // because of amdgpu-waitcnt-forcezero flag
387   bool ForceEmitZeroWaitcnts;
388   bool ForceEmitWaitcnt[NUM_INST_CNTS];
389 
390 public:
391   static char ID;
392 
393   SIInsertWaitcnts() : MachineFunctionPass(ID) {
394     (void)ForceExpCounter;
395     (void)ForceLgkmCounter;
396     (void)ForceVMCounter;
397   }
398 
399   bool runOnMachineFunction(MachineFunction &MF) override;
400 
401   StringRef getPassName() const override {
402     return "SI insert wait instructions";
403   }
404 
405   void getAnalysisUsage(AnalysisUsage &AU) const override {
406     AU.setPreservesCFG();
407     AU.addRequired<MachineLoopInfo>();
408     MachineFunctionPass::getAnalysisUsage(AU);
409   }
410 
411   bool isForceEmitWaitcnt() const {
412     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
413          T = (enum InstCounterType)(T + 1))
414       if (ForceEmitWaitcnt[T])
415         return true;
416     return false;
417   }
418 
419   void setForceEmitWaitcnt() {
420 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
421 // For debug builds, get the debug counter info and adjust if need be
422 #ifndef NDEBUG
423     if (DebugCounter::isCounterSet(ForceExpCounter) &&
424         DebugCounter::shouldExecute(ForceExpCounter)) {
425       ForceEmitWaitcnt[EXP_CNT] = true;
426     } else {
427       ForceEmitWaitcnt[EXP_CNT] = false;
428     }
429 
430     if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
431          DebugCounter::shouldExecute(ForceLgkmCounter)) {
432       ForceEmitWaitcnt[LGKM_CNT] = true;
433     } else {
434       ForceEmitWaitcnt[LGKM_CNT] = false;
435     }
436 
437     if (DebugCounter::isCounterSet(ForceVMCounter) &&
438         DebugCounter::shouldExecute(ForceVMCounter)) {
439       ForceEmitWaitcnt[VM_CNT] = true;
440     } else {
441       ForceEmitWaitcnt[VM_CNT] = false;
442     }
443 #endif // NDEBUG
444   }
445 
446   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
447   void generateWaitcntInstBefore(MachineInstr &MI,
448                                   BlockWaitcntBrackets *ScoreBrackets);
449   void updateEventWaitcntAfter(MachineInstr &Inst,
450                                BlockWaitcntBrackets *ScoreBrackets);
451   void mergeInputScoreBrackets(MachineBasicBlock &Block);
452   bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
453   unsigned countNumBottomBlocks(const MachineLoop *Loop);
454   void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
455   void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
456   bool isWaitcntStronger(unsigned LHS, unsigned RHS);
457   unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
458 };
459 
460 } // end anonymous namespace
461 
462 RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
463                                                  const SIInstrInfo *TII,
464                                                  const MachineRegisterInfo *MRI,
465                                                  const SIRegisterInfo *TRI,
466                                                  unsigned OpNo,
467                                                  bool Def) const {
468   const MachineOperand &Op = MI->getOperand(OpNo);
469   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
470       (Def && !Op.isDef()))
471     return {-1, -1};
472 
473   // A use via a PW operand does not need a waitcnt.
474   // A partial write is not a WAW.
475   assert(!Op.getSubReg() || !Op.isUndef());
476 
477   RegInterval Result;
478   const MachineRegisterInfo &MRIA = *MRI;
479 
480   unsigned Reg = TRI->getEncodingValue(Op.getReg());
481 
482   if (TRI->isVGPR(MRIA, Op.getReg())) {
483     assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
484     Result.first = Reg - RegisterEncoding.VGPR0;
485     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
486   } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
487     assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
488     Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
489     assert(Result.first >= NUM_ALL_VGPRS &&
490            Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
491   }
492   // TODO: Handle TTMP
493   // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
494   else
495     return {-1, -1};
496 
497   const MachineInstr &MIA = *MI;
498   const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
499   unsigned Size = TRI->getRegSizeInBits(*RC);
500   Result.second = Result.first + (Size / 32);
501 
502   return Result;
503 }
504 
505 void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
506                                        const SIInstrInfo *TII,
507                                        const SIRegisterInfo *TRI,
508                                        const MachineRegisterInfo *MRI,
509                                        unsigned OpNo, int32_t Val) {
510   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
511   LLVM_DEBUG({
512     const MachineOperand &Opnd = MI->getOperand(OpNo);
513     assert(TRI->isVGPR(*MRI, Opnd.getReg()));
514   });
515   for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
516     setRegScore(RegNo, EXP_CNT, Val);
517   }
518 }
519 
520 void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
521                                          const SIRegisterInfo *TRI,
522                                          const MachineRegisterInfo *MRI,
523                                          WaitEventType E, MachineInstr &Inst) {
524   const MachineRegisterInfo &MRIA = *MRI;
525   InstCounterType T = eventCounter(E);
526   int32_t CurrScore = getScoreUB(T) + 1;
527   // EventUB and ScoreUB need to be update regardless if this event changes
528   // the score of a register or not.
529   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
530   EventUBs[E] = CurrScore;
531   setScoreUB(T, CurrScore);
532 
533   if (T == EXP_CNT) {
534     // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
535     // is required.
536     if (!MixedExpTypes) {
537       MixedExpTypes = counterOutOfOrder(EXP_CNT);
538     }
539 
540     // Put score on the source vgprs. If this is a store, just use those
541     // specific register(s).
542     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
543       // All GDS operations must protect their address register (same as
544       // export.)
545       if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
546           Inst.getOpcode() != AMDGPU::DS_CONSUME) {
547         setExpScore(
548             &Inst, TII, TRI, MRI,
549             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
550             CurrScore);
551       }
552       if (Inst.mayStore()) {
553         setExpScore(
554             &Inst, TII, TRI, MRI,
555             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
556             CurrScore);
557         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
558                                        AMDGPU::OpName::data1) != -1) {
559           setExpScore(&Inst, TII, TRI, MRI,
560                       AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
561                                                  AMDGPU::OpName::data1),
562                       CurrScore);
563         }
564       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
565                  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
566                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
567                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
568                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
569                  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
570                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
571                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
572                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
573         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
574           const MachineOperand &Op = Inst.getOperand(I);
575           if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
576             setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
577           }
578         }
579       }
580     } else if (TII->isFLAT(Inst)) {
581       if (Inst.mayStore()) {
582         setExpScore(
583             &Inst, TII, TRI, MRI,
584             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
585             CurrScore);
586       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
587         setExpScore(
588             &Inst, TII, TRI, MRI,
589             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
590             CurrScore);
591       }
592     } else if (TII->isMIMG(Inst)) {
593       if (Inst.mayStore()) {
594         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
595       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
596         setExpScore(
597             &Inst, TII, TRI, MRI,
598             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
599             CurrScore);
600       }
601     } else if (TII->isMTBUF(Inst)) {
602       if (Inst.mayStore()) {
603         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
604       }
605     } else if (TII->isMUBUF(Inst)) {
606       if (Inst.mayStore()) {
607         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
608       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
609         setExpScore(
610             &Inst, TII, TRI, MRI,
611             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
612             CurrScore);
613       }
614     } else {
615       if (TII->isEXP(Inst)) {
616         // For export the destination registers are really temps that
617         // can be used as the actual source after export patching, so
618         // we need to treat them like sources and set the EXP_CNT
619         // score.
620         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
621           MachineOperand &DefMO = Inst.getOperand(I);
622           if (DefMO.isReg() && DefMO.isDef() &&
623               TRI->isVGPR(MRIA, DefMO.getReg())) {
624             setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
625                         CurrScore);
626           }
627         }
628       }
629       for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
630         MachineOperand &MO = Inst.getOperand(I);
631         if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
632           setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
633         }
634       }
635     }
636 #if 0 // TODO: check if this is handled by MUBUF code above.
637   } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
638        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
639        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
640     MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
641     unsigned OpNo;//TODO: find the OpNo for this operand;
642     RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
643     for (signed RegNo = Interval.first; RegNo < Interval.second;
644     ++RegNo) {
645       setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
646     }
647 #endif
648   } else {
649     // Match the score to the destination registers.
650     for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
651       RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
652       if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
653         continue;
654       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
655         setRegScore(RegNo, T, CurrScore);
656       }
657     }
658     if (TII->isDS(Inst) && Inst.mayStore()) {
659       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
660     }
661   }
662 }
663 
664 void BlockWaitcntBrackets::print(raw_ostream &OS) {
665   OS << '\n';
666   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
667        T = (enum InstCounterType)(T + 1)) {
668     int LB = getScoreLB(T);
669     int UB = getScoreUB(T);
670 
671     switch (T) {
672     case VM_CNT:
673       OS << "    VM_CNT(" << UB - LB << "): ";
674       break;
675     case LGKM_CNT:
676       OS << "    LGKM_CNT(" << UB - LB << "): ";
677       break;
678     case EXP_CNT:
679       OS << "    EXP_CNT(" << UB - LB << "): ";
680       break;
681     default:
682       OS << "    UNKNOWN(" << UB - LB << "): ";
683       break;
684     }
685 
686     if (LB < UB) {
687       // Print vgpr scores.
688       for (int J = 0; J <= getMaxVGPR(); J++) {
689         int RegScore = getRegScore(J, T);
690         if (RegScore <= LB)
691           continue;
692         int RelScore = RegScore - LB - 1;
693         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
694           OS << RelScore << ":v" << J << " ";
695         } else {
696           OS << RelScore << ":ds ";
697         }
698       }
699       // Also need to print sgpr scores for lgkm_cnt.
700       if (T == LGKM_CNT) {
701         for (int J = 0; J <= getMaxSGPR(); J++) {
702           int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
703           if (RegScore <= LB)
704             continue;
705           int RelScore = RegScore - LB - 1;
706           OS << RelScore << ":s" << J << " ";
707         }
708       }
709     }
710     OS << '\n';
711   }
712   OS << '\n';
713 }
714 
715 unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
716                                                 int ScoreToWait) {
717   unsigned int NeedWait = 0;
718   if (ScoreToWait == -1) {
719     // The score to wait is unknown. This implies that it was not encountered
720     // during the path of the CFG walk done during the current traversal but
721     // may be seen on a different path. Emit an s_wait counter with a
722     // conservative value of 0 for the counter.
723     NeedWait = CNT_MASK(T);
724     setScoreLB(T, getScoreUB(T));
725     return NeedWait;
726   }
727 
728   // If the score of src_operand falls within the bracket, we need an
729   // s_waitcnt instruction.
730   const int32_t LB = getScoreLB(T);
731   const int32_t UB = getScoreUB(T);
732   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
733     if ((T == VM_CNT || T == LGKM_CNT) &&
734         hasPendingFlat() &&
735         !ST->hasFlatLgkmVMemCountInOrder()) {
736       // If there is a pending FLAT operation, and this is a VMem or LGKM
737       // waitcnt and the target can report early completion, then we need
738       // to force a waitcnt 0.
739       NeedWait = CNT_MASK(T);
740       setScoreLB(T, getScoreUB(T));
741     } else if (counterOutOfOrder(T)) {
742       // Counter can get decremented out-of-order when there
743       // are multiple types event in the bracket. Also emit an s_wait counter
744       // with a conservative value of 0 for the counter.
745       NeedWait = CNT_MASK(T);
746       setScoreLB(T, getScoreUB(T));
747     } else {
748       NeedWait = CNT_MASK(T);
749       setScoreLB(T, ScoreToWait);
750     }
751   }
752 
753   return NeedWait;
754 }
755 
756 // Where there are multiple types of event in the bracket of a counter,
757 // the decrement may go out of order.
758 bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
759   switch (T) {
760   case VM_CNT:
761     return false;
762   case LGKM_CNT: {
763     if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
764         EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
765       // Scalar memory read always can go out of order.
766       return true;
767     }
768     int NumEventTypes = 0;
769     if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
770         EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
771       NumEventTypes++;
772     }
773     if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
774         EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
775       NumEventTypes++;
776     }
777     if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
778         EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
779       NumEventTypes++;
780     }
781     if (NumEventTypes <= 1) {
782       return false;
783     }
784     break;
785   }
786   case EXP_CNT: {
787     // If there has been a mixture of export types, then a waitcnt exp(0) is
788     // required.
789     if (MixedExpTypes)
790       return true;
791     int NumEventTypes = 0;
792     if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
793         EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
794       NumEventTypes++;
795     }
796     if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
797         EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
798       NumEventTypes++;
799     }
800     if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
801         EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
802       NumEventTypes++;
803     }
804     if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
805         EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
806       NumEventTypes++;
807     }
808 
809     if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
810         EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
811       NumEventTypes++;
812     }
813 
814     if (NumEventTypes <= 1) {
815       return false;
816     }
817     break;
818   }
819   default:
820     break;
821   }
822   return true;
823 }
824 
// Register the SIInsertWaitcnts pass under the DEBUG_TYPE name with the
// description "SI Insert Waitcnts". Nothing is listed between the BEGIN and
// END macros.
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

// Storage for the pass's static ID member.
char SIInsertWaitcnts::ID = 0;

// Reference to the pass ID, exposed in the llvm namespace as
// SIInsertWaitcntsID.
char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
833 
834 FunctionPass *llvm::createSIInsertWaitcntsPass() {
835   return new SIInsertWaitcnts();
836 }
837 
838 static bool readsVCCZ(const MachineInstr &MI) {
839   unsigned Opc = MI.getOpcode();
840   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
841          !MI.getOperand(1).isUndef();
842 }
843 
844 /// Given wait count encodings checks if LHS is stronger than RHS.
845 bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
846   if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
847     return false;
848   if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
849     return false;
850   if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
851     return false;
852   return true;
853 }
854 
855 /// Given wait count encodings create a new encoding which is stronger
856 /// or equal to both.
857 unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
858   unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
859                             AMDGPU::decodeVmcnt(IV, RHS));
860   unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
861                               AMDGPU::decodeLgkmcnt(IV, RHS));
862   unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
863                              AMDGPU::decodeExpcnt(IV, RHS));
864   return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
865 }
866 
///  Decide whether an s_waitcnt is needed in front of \p MI and, if so,
///  insert a new S_WAITCNT before it or update an existing one.
///
///  Instructions of a given type are returned in order,
///  but instructions of different types can complete out of order.
///  We rely on this in-order completion
///  and simply assign a score to the memory access instructions.
///  We keep track of the active "score bracket" to determine
///  if an access of a memory read requires an s_waitcnt
///  and if so what the value of each counter is.
///  The "score bracket" is bound by the lower bound and upper bound
///  scores (*_score_LB and *_score_ub respectively).
///
///  \param MI             Instruction being examined; any wait is placed
///                        before it. Debug instructions are ignored.
///  \param ScoreBrackets  Per-block score state that is queried and updated
///                        (lower bounds are raised as waits are decided).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
  // To emit, or not to emit - that's the question!
  // Start with an assumption that there is no need to emit.
  // EmitWaitcnt is used as a bitmask: bit CNT_MASK(T) is set for every
  // counter T that has to be waited on.
  unsigned int EmitWaitcnt = 0;

  // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
  bool ForceEmitZeroWaitcnt = false;

  // Note: the force-emit state is refreshed before the debug-instruction
  // early exit below, so it is also evaluated for debug instructions.
  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr())
    return;

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    // Request a wait on every counter and collapse each bracket (LB := UB).
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      EmitWaitcnt |= CNT_MASK(T);
      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    }
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  // (This `else if` pairs with the wait-at-beginning `if` above.)
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //   with knowledge of the called routines.
  // This starts a second, independent if/else-if chain; its final `else`
  // (the operand scan) runs for every instruction that is neither a return
  // nor a GS_DONE message.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      // Only counters with a non-empty bracket need a wait.
      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
        EmitWaitcnt |= CNT_MASK(T);
      }
    }
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      EmitWaitcnt |= CNT_MASK(VM_CNT);
    }
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
            context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
              ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
              ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
        ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    // Memory operands in the LOCAL address space are checked against the
    // score kept in the extra pseudo-register slot
    // SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    }

    // Use operands: getRegInterval is queried with its last argument false.
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If a destination operand that was used by a recent export/store ins,
    // add s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
      }
    }
    // Def operands: getRegInterval is queried with its last argument true.
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingSMEM()) {
      // Wait on everything, not just LGKM.  vccz reads usually come from
      // terminators, and we always wait on everything at the end of the
      // block, so if we only wait on LGKM here, we might end up with
      // another s_waitcnt inserted right after this if there are non-LGKM
      // instructions still outstanding.
      // FIXME: this is too conservative / the comment is wrong.
      // We don't wait on everything at the end of the block and we combine
      // waitcnts so we should never have back-to-back waitcnts.
      ForceEmitZeroWaitcnt = true;
      // NOTE(review): EmitWaitcnt is a bitmask, so this stores the value 1
      // rather than a set of CNT_MASK bits. It only serves to make the
      // `if (EmitWaitcnt || ...)` test below succeed; with
      // ForceEmitZeroWaitcnt set, the all-zero branch is taken and the
      // individual bits are not consulted.
      EmitWaitcnt = true;
    }
  }

  // Does this operand processing indicate s_wait counter update?
  if (EmitWaitcnt || IsForceEmitWaitcnt) {
    // Per-counter value to encode; -1 stands for "don't care".
    int CntVal[NUM_INST_CNTS];

    if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
      // Force all waitcnts to 0.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
      }
      CntVal[VM_CNT] = 0;
      CntVal[EXP_CNT] = 0;
      CntVal[LGKM_CNT] = 0;
    } else {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (EmitWaitcnt & CNT_MASK(T)) {
          // The count to wait for is the width of the remaining bracket.
          int Delta =
              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
          if (Delta >= MaxDelta) {
            // The bracket is at least as wide as the counter's maximum:
            // encode "don't care", drop the request for this counter and,
            // except for EXP_CNT, pull the lower bound up to UB - MaxDelta.
            Delta = -1;
            if (T != EXP_CNT) {
              ScoreBrackets->setScoreLB(
                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
            }
            EmitWaitcnt &= ~CNT_MASK(T);
          }
          CntVal[T] = Delta;
        } else {
          // If we are not waiting for a particular counter then encode
          // it as -1 which means "don't care."
          CntVal[T] = -1;
        }
      }
    }

    // If there is no waitcnt recorded in the brackets, or its encoded
    // counters differ from the values just computed, flag the header of the
    // loop containing MI (if any) so that the loop gets revisited.
    MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
    int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
    if (!OldWaitcnt ||
        (AMDGPU::decodeVmcnt(IV, Imm) !=
         (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
        (AMDGPU::decodeExpcnt(IV, Imm) !=
         (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
        (AMDGPU::decodeLgkmcnt(IV, Imm) !=
         (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
      MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
      if (ContainingLoop) {
        MachineBasicBlock *TBB = ContainingLoop->getHeader();
        BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
        // Create the header's brackets on demand; a header without brackets
        // must not have been visited yet.
        if (!ScoreBracket) {
          assert(!BlockVisitedSet.count(TBB));
          BlockWaitcntBracketsMap[TBB] =
              llvm::make_unique<BlockWaitcntBrackets>(ST);
          ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
        }
        ScoreBracket->setRevisitLoop(true);
        LLVM_DEBUG(dbgs() << "set-revisit2: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
      }
    }

    // Update an existing waitcount, or make a new one.
    // Counters flagged in ForceEmitWaitcnt are encoded as 0 regardless of
    // the computed value.
    unsigned Enc = AMDGPU::encodeWaitcnt(IV,
                      ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
                      ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
                      ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
    // We don't remove waitcnts that existed prior to the waitcnt
    // pass. Check if the waitcnt to-be-inserted can be avoided
    // or if the prev waitcnt can be updated.
    bool insertSWaitInst = true;
    // The unconditional `break` at the bottom of the loop body means that at
    // most one instruction - the one directly preceding MI - is inspected.
    // NOTE(review): because the loop condition is `I != B`, that instruction
    // is not inspected when it is the first instruction of the block.
    for (MachineBasicBlock::iterator I = MI.getIterator(),
                                     B = MI.getParent()->begin();
         insertSWaitInst && I != B; --I) {
      if (I == MI.getIterator())
        continue;

      switch (I->getOpcode()) {
      case AMDGPU::S_WAITCNT:
        // An existing, at-least-as-strong waitcnt makes a new one redundant;
        // otherwise adopt it (if none is tracked yet) and merge the encodings.
        if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
          insertSWaitInst = false;
        else if (!OldWaitcnt) {
          OldWaitcnt = &*I;
          Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
        }
        break;
        // TODO: skip over instructions which never require wait.
      }
      break;
    }
    if (insertSWaitInst) {
      if (OldWaitcnt) {
        assert(OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT);
        if (ForceEmitZeroWaitcnts)
          LLVM_DEBUG(dbgs()
                     << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
        if (IsForceEmitWaitcnt)
          LLVM_DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n");

        // Reuse the existing instruction: rewrite its immediate and, if it
        // is currently detached from any block, re-insert it before MI.
        OldWaitcnt->getOperand(0).setImm(Enc);
        if (!OldWaitcnt->getParent())
          MI.getParent()->insert(MI, OldWaitcnt);

        LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                          << "Old Instr: " << MI << '\n'
                          << "New Instr: " << *OldWaitcnt << '\n');
      } else {
        // No reusable waitcnt: build a new one in front of MI and remember
        // it as pass-generated.
        auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                                 MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                             .addImm(Enc);
        TrackedWaitcntSet.insert(SWaitInst);

        LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                          << "Old Instr: " << MI << '\n'
                          << "New Instr: " << *SWaitInst << '\n');
      }
    }

    // A computed expcnt of 0 resets the mixed-export-types flag.
    if (CntVal[EXP_CNT] == 0) {
      ScoreBrackets->setMixedExpTypes(false);
    }
  }
}
1253 
1254 void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
1255                                              MachineInstr *Waitcnt) {
1256   if (MBB.empty()) {
1257     MBB.push_back(Waitcnt);
1258     return;
1259   }
1260 
1261   MachineBasicBlock::iterator It = MBB.end();
1262   MachineInstr *MI = &*(--It);
1263   if (MI->isBranch()) {
1264     MBB.insert(It, Waitcnt);
1265   } else {
1266     MBB.push_back(Waitcnt);
1267   }
1268 }
1269 
1270 // This is a flat memory operation. Check to see if it has memory
1271 // tokens for both LDS and Memory, and if so mark it as a flat.
1272 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1273   if (MI.memoperands_empty())
1274     return true;
1275 
1276   for (const MachineMemOperand *Memop : MI.memoperands()) {
1277     unsigned AS = Memop->getAddrSpace();
1278     if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1279       return true;
1280   }
1281 
1282   return false;
1283 }
1284 
1285 void SIInsertWaitcnts::updateEventWaitcntAfter(
1286     MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
1287   // Now look at the instruction opcode. If it is a memory access
1288   // instruction, update the upper-bound of the appropriate counter's
1289   // bracket and the destination operand scores.
1290   // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1291   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1292     if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1293       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1294       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1295     } else {
1296       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1297     }
1298   } else if (TII->isFLAT(Inst)) {
1299     assert(Inst.mayLoad() || Inst.mayStore());
1300 
1301     if (TII->usesVM_CNT(Inst))
1302       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1303 
1304     if (TII->usesLGKM_CNT(Inst)) {
1305       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1306 
1307       // This is a flat memory operation, so note it - it will require
1308       // that both the VM and LGKM be flushed to zero if it is pending when
1309       // a VM or LGKM dependency occurs.
1310       if (mayAccessLDSThroughFlat(Inst))
1311         ScoreBrackets->setPendingFlat();
1312     }
1313   } else if (SIInstrInfo::isVMEM(Inst) &&
1314              // TODO: get a better carve out.
1315              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1316              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1317              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1318     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1319     if (ST->vmemWriteNeedsExpWaitcnt() &&
1320         (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1321       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1322     }
1323   } else if (TII->isSMRD(Inst)) {
1324     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1325   } else {
1326     switch (Inst.getOpcode()) {
1327     case AMDGPU::S_SENDMSG:
1328     case AMDGPU::S_SENDMSGHALT:
1329       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1330       break;
1331     case AMDGPU::EXP:
1332     case AMDGPU::EXP_DONE: {
1333       int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1334       if (Imm >= 32 && Imm <= 63)
1335         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1336       else if (Imm >= 12 && Imm <= 15)
1337         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1338       else
1339         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1340       break;
1341     }
1342     case AMDGPU::S_MEMTIME:
1343     case AMDGPU::S_MEMREALTIME:
1344       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1345       break;
1346     default:
1347       break;
1348     }
1349   }
1350 }
1351 
// Merge the score brackets of the Block's predecessors;
// this merged score bracket is used when adding waitcnts to the Block.
//
// Outline:
//  1. Compute, per counter, the widest pending span (UB - LB) and the widest
//     pending-flat span over all visited predecessors.
//  2. Reset the Block's bracket to [0, MaxPending] per counter.
//  3. Rebase each predecessor's register scores and event upper bounds onto
//     that common bracket and keep the maximum.
// The result is stored in BlockWaitcntBracketsMap[&Block].
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
  // Widest pending span / pending-flat span seen so far, per counter.
  int32_t MaxPending[NUM_INST_CNTS] = {0};
  int32_t MaxFlat[NUM_INST_CNTS] = {0};
  bool MixedExpTypes = false;

  // For single basic block loops, we need to retain the Block's
  // score bracket to have accurate Pred info. So, make a copy of Block's
  // score bracket, clear() it (which retains several important bits of info),
  // populate, and then replace en masse. For non-single basic block loops,
  // just clear Block's current score bracket and repopulate in-place.
  bool IsSelfPred;
  std::unique_ptr<BlockWaitcntBrackets> S;

  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
    != Block.pred_end();
  if (IsSelfPred) {
    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    ScoreBrackets = S.get();
  }

  ScoreBrackets->clear();

  // Gather the maximum pending spans from the predecessors. Predecessors
  // that have not been visited yet, or that are flagged to wait at their
  // beginning, do not contribute here.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    // PredScoreBrackets is only dereferenced once Visited is known true.
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int span =
          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
      MaxPending[T] = std::max(MaxPending[T], span);
      span =
          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
      MaxFlat[T] = std::max(MaxFlat[T], span);
    }

    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
  }

  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
  // Their event upper bounds may also widen the EXP_CNT span.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }

    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
  }

#if 0
  // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
  // TODO: how does LC distinguish between function entry and main entry?
  // If this is the entry to a function, force a wait.
  MachineBasicBlock &Entry = Block.getParent()->front();
  if (Entry.getNumber() == Block.getNumber()) {
    ScoreBrackets->setWaitAtBeginning();
    return;
  }
#endif

  // Now set the current Block's brackets to the largest ending bracket.
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    ScoreBrackets->setScoreUB(T, MaxPending[T]);
    ScoreBrackets->setScoreLB(T, 0);
    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
  }

  ScoreBrackets->setMixedExpTypes(MixedExpTypes);

  // Set the register scoreboard.
  // Unlike the span computation above, this loop only skips unvisited
  // predecessors; the wait-at-beginning flag is not consulted.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    // Now merge the gpr_reg_score information
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int PredLB = PredScoreBrackets->getScoreLB(T);
      int PredUB = PredScoreBrackets->getScoreUB(T);
      if (PredLB < PredUB) {
        // Offset that maps the predecessor's upper bound onto MaxPending[T].
        int PredScale = MaxPending[T] - PredUB;
        // Merge vgpr scores.
        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
          int PredRegScore = PredScoreBrackets->getRegScore(J, T);
          // Scores at or below the lower bound are no longer pending.
          if (PredRegScore <= PredLB)
            continue;
          int NewRegScore = PredScale + PredRegScore;
          ScoreBrackets->setRegScore(
              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
        }
        // Also need to merge sgpr scores for lgkm_cnt.
        // SGPR slots are indexed starting at NUM_ALL_VGPRS.
        if (T == LGKM_CNT) {
          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
            int PredRegScore =
                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J + NUM_ALL_VGPRS, LGKM_CNT,
                std::max(
                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                    NewRegScore));
          }
        }
      }
    }

    // Also merge the WaitEvent information.
    // Each still-pending event upper bound is rebased by the same offset
    // (MaxPending - predecessor UB); only positive results are recorded.
    ForAllWaitEventType(W) {
      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
      int PredEventUB = PredScoreBrackets->getEventUB(W);
      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
        int NewEventUB =
            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
        if (NewEventUB > 0) {
          ScoreBrackets->setEventUB(
              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
        }
      }
    }
  }

  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
  // sequencing predecessors, because changes to EXEC require waitcnts due to
  // the delayed nature of these operations.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_gds_ub > 0) {
        ScoreBrackets->setEventUB(
            GDS_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
      }
    }
    int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_exp_ub > 0) {
        ScoreBrackets->setEventUB(
            EXP_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
      }
    }
  }

  // if a single block loop, update the score brackets. Not needed for other
  // blocks, as we did this in-place
  // (the merged working copy replaces the map entry for Block).
  if (IsSelfPred) {
    BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
  }
}
1534 
1535 /// Return true if the given basic block is a "bottom" block of a loop.
1536 /// This works even if the loop is discontiguous. This also handles
1537 /// multiple back-edges for the same "header" block of a loop.
1538 bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1539                                     const MachineBasicBlock *Block) {
1540   for (MachineBasicBlock *MBB : Loop->blocks()) {
1541     if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1542       return true;
1543     }
1544   }
1545   return false;
1546 }
1547 
1548 /// Count the number of "bottom" basic blocks of a loop.
1549 unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1550   unsigned Count = 0;
1551   for (MachineBasicBlock *MBB : Loop->blocks()) {
1552     if (MBB->isSuccessor(Loop->getHeader())) {
1553       Count++;
1554     }
1555   }
1556   return Count;
1557 }
1558 
/// Generate s_waitcnt instructions where needed within a single block.
///
/// The block's score brackets are first initialized from its inputs via
/// mergeInputScoreBrackets(). Every instruction is then visited in order:
/// existing S_WAITCNT instructions are recorded (and unlinked if this pass
/// created them), any required wait is generated before the instruction, and
/// the score brackets are updated with the events the instruction produces.
/// Finally, if the block is a loop "bottom" block and the loop has been
/// iterated more often than the convergence threshold, a wait is forced at the
/// end of the block.
///
/// \param MF    The function being processed (not referenced in this body).
/// \param Block The block to scan and update.
void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block) {
  // Initialize the state information.
  mergeInputScoreBrackets(Block);

  // Fetch the brackets only after the merge: the tail of
  // mergeInputScoreBrackets() may replace this block's map entry.
  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets->dump();
  });

  // Walk over the instructions. The iterator is advanced manually because
  // instructions may be unlinked from, or inserted into, the block while
  // walking.
  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;
    // Handle S_WAITCNT instructions that are already present in the block.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
      // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
      // as needed.
      if (!TrackedWaitcntSet.count(&Inst))
        ++Iter;
      else {
        // Step past the instruction before unlinking it so that Iter never
        // refers to an instruction that is no longer in the block.
        ++Iter;
        Inst.removeFromParent();
      }
      // In both cases the instruction is handed to the score brackets; for a
      // tracked waitcnt this happens after it has been unlinked from the block.
      ScoreBrackets->setWaitcnt(&Inst);
      continue;
    }

    // Decide up front whether the vccz work-around is needed for this
    // instruction: it reads vccz, has not been handled before, LGKM_CNT has
    // outstanding events including a pending SMEM, and the subtarget
    // generation is SEA_ISLANDS or older. The decision is taken before the
    // score brackets are updated for this instruction below.
    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
        (!VCCZBugHandledSet.count(&Inst))) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingSMEM()) {
        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }

    // Generate an s_waitcnt instruction to be placed before
    // Inst, if needed.
    generateWaitcntInstBefore(Inst, ScoreBrackets);

    // Account for the events issued by Inst itself.
    updateEventWaitcntAfter(Inst, ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0)
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
      ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    // Forget any waitcnt recorded via setWaitcnt() for an earlier instruction.
    ScoreBrackets->clearWaitcnt();

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets->dump();
    });

    // Check to see if this is a GWS instruction. If so, and if this is CI or
    // VI, then the generated code sequence will include an S_WAITCNT 0.
    // The brackets are therefore updated as if all three counters had been
    // waited on up to their upper bounds.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
      ScoreBrackets->updateByWait(LGKM_CNT,
                                  ScoreBrackets->getScoreUB(LGKM_CNT));
    }

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    // NOTE(review): no assert is present in this function; the TODO appears to
    // refer to code that is no longer here.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit.  Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      // The S_MOV_B64 is inserted immediately before Inst, and Inst is
      // remembered so the work-around is not applied to it again on a re-walk.
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    ++Iter;
  }

  // Check if we need to force convergence at loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    LLVM_DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't guarantee convergence for a loop. Each
    // loop should take at most (n+1) iterations for it to converge naturally,
    // where n is the number of bottom blocks. If this threshold is reached and
    // the result hasn't converged, then we force convergence by inserting
    // a s_waitcnt at the end of loop footer.
    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
      // To ensure convergence, need to make wait events at loop footer be no
      // more than those from the previous iteration.
      // As a simplification, instead of tracking individual scores and
      // generating the precise wait count, just wait on 0.
      bool HasPending = false;
      // Reuse the forced waitcnt remembered for this loop, if there is one.
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      // NOTE(review): the loop stops at the first counter that has pending
      // events, so only that counter's lower bound is raised even though the
      // emitted instruction waits on 0 - confirm this is intended.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
          break;
        }
      }

      if (HasPending) {
        if (!SWaitInst) {
          // Create "s_waitcnt 0" and mark it as generated by this pass so a
          // later walk over the block removes it again.
          // NOTE(review): BuildMI with a block and an insertion point already
          // places the new instruction in Block (before the first non-PHI),
          // yet it is handed to insertWaitcntBeforeCF() below as well. That
          // helper is not visible here - confirm it copes with an instruction
          // that already has a parent block.
          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                              .addImm(0);
          TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        LLVM_DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or
        // created in previous iterations and added back since block traversal
        // always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}
1717 
1718 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1719   ST = &MF.getSubtarget<GCNSubtarget>();
1720   TII = ST->getInstrInfo();
1721   TRI = &TII->getRegisterInfo();
1722   MRI = &MF.getRegInfo();
1723   MLI = &getAnalysis<MachineLoopInfo>();
1724   IV = AMDGPU::getIsaVersion(ST->getCPU());
1725   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1726 
1727   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1728   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1729        T = (enum InstCounterType)(T + 1))
1730     ForceEmitWaitcnt[T] = false;
1731 
1732   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1733   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1734   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1735 
1736   HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1737   HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1738   assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1739   assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1740 
1741   RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1742   RegisterEncoding.VGPRL =
1743       RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1744   RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1745   RegisterEncoding.SGPRL =
1746       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1747 
1748   TrackedWaitcntSet.clear();
1749   BlockVisitedSet.clear();
1750   VCCZBugHandledSet.clear();
1751   LoopWaitcntDataMap.clear();
1752   BlockWaitcntProcessedSet.clear();
1753 
1754   // Walk over the blocks in reverse post order, inserting
1755   // s_waitcnt where needed.
1756   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1757   bool Modified = false;
1758   for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1759            I = RPOT.begin(),
1760            E = RPOT.end(), J = RPOT.begin();
1761        I != E;) {
1762     MachineBasicBlock &MBB = **I;
1763 
1764     BlockVisitedSet.insert(&MBB);
1765 
1766     BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1767     if (!ScoreBrackets) {
1768       BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
1769       ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1770     }
1771     ScoreBrackets->setPostOrder(MBB.getNumber());
1772     MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1773     if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
1774       LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
1775 
1776     // If we are walking into the block from before the loop, then guarantee
1777     // at least 1 re-walk over the loop to propagate the information, even if
1778     // no S_WAITCNT instructions were generated.
1779     if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1780       unsigned Count = countNumBottomBlocks(ContainingLoop);
1781 
1782       // If the loop has multiple back-edges, and so more than one "bottom"
1783       // basic block, we have to guarantee a re-walk over every blocks.
1784       if ((std::count(BlockWaitcntProcessedSet.begin(),
1785                       BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
1786         BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
1787         LLVM_DEBUG(dbgs() << "set-revisit1: Block"
1788                           << ContainingLoop->getHeader()->getNumber() << '\n';);
1789       }
1790     }
1791 
1792     // Walk over the instructions.
1793     insertWaitcntInBlock(MF, MBB);
1794 
1795     // Record that waitcnts have been processed at least once for this block.
1796     BlockWaitcntProcessedSet.push_back(&MBB);
1797 
1798     // See if we want to revisit the loop. If a loop has multiple back-edges,
1799     // we shouldn't revisit the same "bottom" basic block.
1800     if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1801         std::count(BlockWaitcntProcessedSet.begin(),
1802                    BlockWaitcntProcessedSet.end(), &MBB) == 1) {
1803       MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
1804       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1805       if (EntrySB && EntrySB->getRevisitLoop()) {
1806         EntrySB->setRevisitLoop(false);
1807         J = I;
1808         int32_t PostOrder = EntrySB->getPostOrder();
1809         // TODO: Avoid this loop. Find another way to set I.
1810         for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1811                  X = RPOT.begin(),
1812                  Y = RPOT.end();
1813              X != Y; ++X) {
1814           MachineBasicBlock &MBBX = **X;
1815           if (MBBX.getNumber() == PostOrder) {
1816             I = X;
1817             break;
1818           }
1819         }
1820         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1821         WaitcntData->incIterCnt();
1822         LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
1823         continue;
1824       } else {
1825         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1826         // Loop converged, reset iteration count. If this loop gets revisited,
1827         // it must be from an outer loop, the counter will restart, this will
1828         // ensure we don't force convergence on such revisits.
1829         WaitcntData->resetIterCnt();
1830       }
1831     }
1832 
1833     J = I;
1834     ++I;
1835   }
1836 
1837   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1838 
1839   bool HaveScalarStores = false;
1840 
1841   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1842        ++BI) {
1843     MachineBasicBlock &MBB = *BI;
1844 
1845     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1846          ++I) {
1847       if (!HaveScalarStores && TII->isScalarStore(*I))
1848         HaveScalarStores = true;
1849 
1850       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1851           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1852         EndPgmBlocks.push_back(&MBB);
1853     }
1854   }
1855 
1856   if (HaveScalarStores) {
1857     // If scalar writes are used, the cache must be flushed or else the next
1858     // wave to reuse the same scratch memory can be clobbered.
1859     //
1860     // Insert s_dcache_wb at wave termination points if there were any scalar
1861     // stores, and only if the cache hasn't already been flushed. This could be
1862     // improved by looking across blocks for flushes in postdominating blocks
1863     // from the stores but an explicitly requested flush is probably very rare.
1864     for (MachineBasicBlock *MBB : EndPgmBlocks) {
1865       bool SeenDCacheWB = false;
1866 
1867       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1868            ++I) {
1869         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1870           SeenDCacheWB = true;
1871         else if (TII->isScalarStore(*I))
1872           SeenDCacheWB = false;
1873 
1874         // FIXME: It would be better to insert this before a waitcnt if any.
1875         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1876              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1877             !SeenDCacheWB) {
1878           Modified = true;
1879           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1880         }
1881       }
1882     }
1883   }
1884 
1885   if (!MFI->isEntryFunction()) {
1886     // Wait for any outstanding memory operations that the input registers may
1887     // depend on. We can't track them and it's better to the wait after the
1888     // costly call sequence.
1889 
1890     // TODO: Could insert earlier and schedule more liberally with operations
1891     // that only use caller preserved registers.
1892     MachineBasicBlock &EntryBB = MF.front();
1893     BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1894       .addImm(0);
1895 
1896     Modified = true;
1897   }
1898 
1899   return Modified;
1900 }
1901