//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
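///
/// For example (illustrative GCN assembly, not the literal output of this
/// pass), a use of a value produced by a vector-memory load must be
/// preceded by a wait on vmcnt:
///
///   buffer_load_dword v0, ...   ; asynchronous read, vm_cnt pending
///   ...
///   s_waitcnt vmcnt(0)          ; wait for the load to complete
///   v_add_f32 v1, v0, v0        ; now safe to read v0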
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

#define DEBUG_TYPE "si-insert-waitcnts"

using namespace llvm;

namespace {

// Class of object that encapsulates the latest instruction counter score
// associated with an operand. Used for determining whether an s_waitcnt
// instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

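// A half-open interval [first, second) of slots in the scoring tables;
// {-1, -1} marks an operand that needs no tracking.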
using RegInterval = std::pair<signed, signed>;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
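// For example, under this mapping v5 occupies slot 5, the LDS placeholder
// occupies slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 256, and s3 occupies
// slot NUM_ALL_VGPRS + 3 = 260.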

#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))

// This is a per-basic-block object that maintains the current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of events occur within a bracket, the
// wait count may be decremented out of order, so we need to put in an
// "s_waitcnt 0" before the use.
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets() {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
  }

  ~BlockWaitcntBrackets() = default;

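  // Maximum value the hardware counter can hold before saturating, taken
  // from the subtarget via HardwareLimits (typically 15 for vmcnt, or 63 on
  // targets with the extended vmcnt encoding -- stated here as an
  // illustration; the authoritative values come from the ISA info).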
  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }

  int32_t getScoreLB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  int32_t getScoreUB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    switch (E) {
    case VMEM_ACCESS:
      return VM_CNT;
    case LDS_ACCESS:
    case GDS_ACCESS:
    case SQ_MESSAGE:
    case SMEM_ACCESS:
      return LGKM_CNT;
    case EXP_GPR_LOCK:
    case GDS_GPR_LOCK:
    case VMW_GPR_LOCK:
    case EXP_POS_ACCESS:
    case EXP_PARAM_ACCESS:
      return EXP_CNT;
    default:
      llvm_unreachable("unhandled event type");
    }
    return NUM_INST_CNTS;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    memset(EventUBs, 0, sizeof(EventUBs));
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  int32_t getEventUB(enum WaitEventType W) const {
    assert(W < NUM_WAIT_EVENTS);
    return EventUBs[W];
  }

  bool counterOutOfOrder(InstCounterType T);
  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  bool hasPendingSMEM() const {
    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
  void clearWaitcnt() { Waitcnt = nullptr; }
  MachineInstr *getWaitcnt() const { return Waitcnt; }

  bool mixedExpTypes() const { return MixedExpTypes; }
  void setMixedExpTypes(bool MixedExpTypesIn) {
    MixedExpTypes = MixedExpTypesIn;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  bool WaitAtBeginning = false;
  bool RevisitLoop = false;
  bool ValidLoop = false;
  bool MixedExpTypes = false;
  MachineLoop *LoopRegion = nullptr;
  int32_t PostOrder = 0;
  MachineInstr *Waitcnt = nullptr;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};

// This is a per-loop-region object that records the waitcnt status at the
// end of the loop footer from the previous iteration. We also maintain an
// iteration count to track the number of times the loop has been visited.
// When it doesn't converge naturally, we force convergence by inserting an
// s_waitcnt 0 at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  int32_t getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() {
    DEBUG(dbgs() << "  iteration " << IterCnt << '\n';);
  }

private:
  // The s_waitcnt added at the end of the loop footer to stabilize the wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of times the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const SISubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaInfo::IsaVersion IV;
  AMDGPUAS AMDGPUASI;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
    // The waitcnt information is copied because it changes as the block is
    // traversed.
    KillWaitBrackets.push_back(
        llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
                                           BlockWaitcntBrackets *ScoreBrackets);
  void updateEventWaitCntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  MachineBasicBlock *loopBottom(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
};

} // end anonymous namespace

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a partial-write (PW) operand does not need a waitcnt;
  // a partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}

void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not.
  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
    ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
                                                int ScoreToWait) {
  unsigned int NeedWait = 0;
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_wait counter with a
    // conservative value of 0 for the counter.
    NeedWait = CNT_MASK(T);
    setScoreLB(T, getScoreUB(T));
    return NeedWait;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if (T == VM_CNT && hasPendingFlat()) {
      // If there is a pending FLAT operation, and this is a VM waitcnt,
      // then we need to force a waitcnt 0 for VM.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are multiple
      // types of events in the bracket. Also emit an s_wait counter with a
      // conservative value of 0 for the counter.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else {
      NeedWait = CNT_MASK(T);
      setScoreLB(T, ScoreToWait);
    }
  }

  return NeedWait;
}

// Where there are multiple types of events in the bracket of a counter,
// the decrement may go out of order.
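// For example (a hypothetical schedule), if an SMEM load and an LDS access
// are both pending in the lgkm bracket, the hardware may retire them in
// either order, so a precise non-zero lgkmcnt cannot be trusted and the
// conservative "wait for 0" is used instead.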
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
  switch (T) {
  case VM_CNT:
    return false;
  case LGKM_CNT: {
    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      // Scalar memory reads can always go out of order.
      return true;
    }
    int NumEventTypes = 0;
    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  case EXP_CNT: {
    // If there has been a mixture of export types, then a waitcnt exp(0) is
    // required.
    if (MixedExpTypes)
      return true;
    int NumEventTypes = 0;
    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  default:
    break;
  }
  return true;
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

///  \brief Generate s_waitcnt instruction to be placed before cur_Inst.
///  Instructions of a given type are returned in order,
///  but instructions of different types can complete out of order.
///  We rely on this in-order completion
///  and simply assign a score to the memory access instructions.
///  We keep track of the active "score bracket" to determine
///  if an access of a memory read requires an s_waitcnt
///  and if so what the value of each counter is.
///  The "score bracket" is bound by the lower bound and upper bound
///  scores (*_score_LB and *_score_ub respectively).
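///  For example (hypothetical scores): with a vm_cnt bracket of (2, 5] and a
///  source vgpr scored 4, the emitted wait is vmcnt(5 - 4) = vmcnt(1), i.e.
///  exactly one younger vector-memory operation may remain outstanding.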
MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
  // To emit, or not to emit - that's the question!
  // Start with an assumption that there is no need to emit.
  unsigned int EmitSwaitcnt = 0;
  // s_waitcnt instruction to return; default is NULL.
  MachineInstr *SWaitInst = nullptr;
  // No need to wait before a phi. If a phi-move exists, then the wait should
  // have been inserted before the move. If a phi-move does not exist, then
  // the wait should be inserted before the real use. The same is true for
  // sc-merge. It is not a coincidence that all these cases correspond to the
  // instructions that are skipped in the assembling loop.
  bool NeedLineMapping = false; // TODO: Check on this.
  if (MI.isDebugValue() &&
      // TODO: any other opcode?
      !NeedLineMapping) {
    return SWaitInst;
  }

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      EmitSwaitcnt |= CNT_MASK(T);
      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    }
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    EmitSwaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //   with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::RETURN ||
      MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
        EmitSwaitcnt |= CNT_MASK(T);
      }
    }
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      EmitSwaitcnt |= CNT_MASK(VM_CNT);
    }
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
            context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
              ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
              ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      EmitSwaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
      EmitSwaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
      EmitSwaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
      EmitSwaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
        ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitSwaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUASI.LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      EmitSwaitcnt |= ScoreBrackets->updateByWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          EmitSwaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        }
        EmitSwaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If a destination operand was used by a recent export/store
    // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUASI.LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        EmitSwaitcnt |= ScoreBrackets->updateByWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        EmitSwaitcnt |= ScoreBrackets->updateByWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          EmitSwaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
          EmitSwaitcnt |= ScoreBrackets->updateByWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
        }
        EmitSwaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    } // End of for loop that looks at all dest operands.
  }

  // TODO: Tie force zero to a compiler triage option.
  bool ForceZero = false;

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    EmitSwaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    EmitSwaitcnt |= ScoreBrackets->updateByWait(
        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    EmitSwaitcnt |= ScoreBrackets->updateByWait(
        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingSMEM()) {
      // Wait on everything, not just LGKM.  vccz reads usually come from
      // terminators, and we always wait on everything at the end of the
      // block, so if we only wait on LGKM here, we might end up with
      // another s_waitcnt inserted right after this if there are non-LGKM
      // instructions still outstanding.
      ForceZero = true;
      EmitSwaitcnt = true;
    }
  }

  // Does this operand processing indicate s_wait counter update?
  if (EmitSwaitcnt) {
    int CntVal[NUM_INST_CNTS];

    bool UseDefaultWaitcntStrategy = true;
    if (ForceZero) {
      // Force all waitcnts to 0.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
      }
      CntVal[VM_CNT] = 0;
      CntVal[EXP_CNT] = 0;
      CntVal[LGKM_CNT] = 0;
      UseDefaultWaitcntStrategy = false;
    }

    if (UseDefaultWaitcntStrategy) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (EmitSwaitcnt & CNT_MASK(T)) {
          int Delta =
              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
          if (Delta >= MaxDelta) {
            Delta = -1;
            if (T != EXP_CNT) {
              ScoreBrackets->setScoreLB(
                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
            }
            EmitSwaitcnt &= ~CNT_MASK(T);
          }
          CntVal[T] = Delta;
        } else {
          // If we are not waiting for a particular counter then encode
          // it as -1 which means "don't care."
          CntVal[T] = -1;
        }
      }
    }

    // If we are not waiting on any counter we can skip the wait altogether.
    if (EmitSwaitcnt != 0) {
      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
      if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
                          (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
          (AMDGPU::decodeExpcnt(IV, Imm) !=
           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
        if (ContainingLoop) {
          MachineBasicBlock *TBB = ContainingLoop->getHeader();
          BlockWaitcntBrackets *ScoreBracket =
              BlockWaitcntBracketsMap[TBB].get();
          if (!ScoreBracket) {
            assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
            BlockWaitcntBracketsMap[TBB] =
                llvm::make_unique<BlockWaitcntBrackets>();
            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
          }
          ScoreBracket->setRevisitLoop(true);
          DEBUG(dbgs() << "set-revisit: block"
                       << ContainingLoop->getHeader()->getNumber() << '\n';);
        }
      }

      // Update an existing waitcount, or make a new one.
      MachineFunction &MF = *MI.getParent()->getParent();
      if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
        SWaitInst = OldWaitcnt;
      } else {
        SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
                                          MI.getDebugLoc());
        CompilerGeneratedWaitcntSet.insert(SWaitInst);
      }

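      // Note: CntVal entries of -1 mean "don't care"; when masked with the
      // per-counter bit mask (as in the comparisons above) they saturate to
      // the field maximum, which the hardware treats as "no wait".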
      const MachineOperand &Op =
          MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
              IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
      SWaitInst->addOperand(MF, Op);

      if (CntVal[EXP_CNT] == 0) {
        ScoreBrackets->setMixedExpTypes(false);
      }
    }
  }

  return SWaitInst;
}

void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }
}

// This is a flat memory operation. Check to see if it has memory tokens
// for both LDS and memory, and if so, mark it as flat.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
      return true;
  }

  return false;
}

void SIInsertWaitcnts::updateEventWaitCntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
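      // Export target ranges follow the GCN EXP encoding: 12-15 are the
      // position exports and 32-63 the parameter exports; everything else
      // (e.g. the MRT color targets 0-7) takes the generic EXP_GPR_LOCK path.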
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}

void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
  int32_t MaxPending[NUM_INST_CNTS] = {0};
  int32_t MaxFlat[NUM_INST_CNTS] = {0};
  bool MixedExpTypes = false;

  // Clear the score bracket state.
  ScoreBrackets->clear();

  // Compute the number of pending elements on block entry.

  // IMPORTANT NOTE: If iterative handling of loops is added, the code will
  // need to handle single BBs with backedges to themselves. This means that
  // they will need to retain and not clear their initial state.

  // See if there are any uninitialized predecessors. If so, emit an
  // s_waitcnt 0 at the beginning of the block.
  for (MachineBasicBlock *pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[pred].get();
    bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      break;
    }
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int span =
          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
      MaxPending[T] = std::max(MaxPending[T], span);
      span =
          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
      MaxFlat[T] = std::max(MaxFlat[T], span);
    }

    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
  }

  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
  // Also handle kills for exit block.
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        int Span = KillWaitBrackets[I]->getScoreUB(T) -
                   KillWaitBrackets[I]->getScoreLB(T);
        MaxPending[T] = std::max(MaxPending[T], Span);
        Span = KillWaitBrackets[I]->pendingFlat(T) -
               KillWaitBrackets[I]->getScoreLB(T);
        MaxFlat[T] = std::max(MaxFlat[T], Span);
      }

      MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
    }
  }

  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      break;
    }

    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
  }

  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
      int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    }
  }

#if 0
  // LC does not (unlike SC) add a waitcnt at the beginning. Leaving it as a
  // marker.
  // TODO: how does LC distinguish between function entry and main entry?
  // If this is the entry to a function, force a wait.
  MachineBasicBlock &Entry = Block.getParent()->front();
  if (Entry.getNumber() == Block.getNumber()) {
    ScoreBrackets->setWaitAtBeginning();
    return;
  }
#endif

  // Now set the current Block's brackets to the largest ending bracket.
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    ScoreBrackets->setScoreUB(T, MaxPending[T]);
    ScoreBrackets->setScoreLB(T, 0);
    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
  }

  ScoreBrackets->setMixedExpTypes(MixedExpTypes);

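  // Rescaling note (illustrative numbers): each predecessor's scores are
  // shifted into the new bracket with
  //   NewRegScore = (MaxPending[T] - PredUB) + PredRegScore,
  // so a predecessor with bracket (LB=4, UB=7] and a register scored 6
  // contributes score (5 - 7) + 6 = 4 when MaxPending[T] is 5.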
1357   // Set the register scoreboard.
1358   for (MachineBasicBlock *Pred : Block.predecessors()) {
1359     if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
1360       break;
1361     }
1362 
1363     BlockWaitcntBrackets *PredScoreBrackets =
1364         BlockWaitcntBracketsMap[Pred].get();
1365 
1366     // Now merge the gpr_reg_score information
1367     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1368          T = (enum InstCounterType)(T + 1)) {
1369       int PredLB = PredScoreBrackets->getScoreLB(T);
1370       int PredUB = PredScoreBrackets->getScoreUB(T);
1371       if (PredLB < PredUB) {
1372         int PredScale = MaxPending[T] - PredUB;
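        // Rebasing sketch (hypothetical values): the merged bracket for T
        // spans [0, MaxPending[T]]. If this predecessor ended with LB=2 and
        // UB=5 while MaxPending[T]=7, then PredScale = 7 - 5 = 2, and a
        // register that scored 4 in the predecessor maps to 4 + 2 = 6 here,
        // preserving its distance from the upper bound (one event before UB).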
        // Merge vgpr scores.
        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
          int PredRegScore = PredScoreBrackets->getRegScore(J, T);
          if (PredRegScore <= PredLB)
            continue;
          int NewRegScore = PredScale + PredRegScore;
          ScoreBrackets->setRegScore(
              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
        }
        // Also need to merge sgpr scores for lgkm_cnt.
        if (T == LGKM_CNT) {
          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
            int PredRegScore =
                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J + NUM_ALL_VGPRS, LGKM_CNT,
                std::max(
                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                    NewRegScore));
          }
        }
      }
    }

    // Also merge the WaitEvent information.
    ForAllWaitEventType(W) {
      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
      int PredEventUB = PredScoreBrackets->getEventUB(W);
      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
        int NewEventUB =
            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
        if (NewEventUB > 0) {
          ScoreBrackets->setEventUB(
              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
        }
      }
    }
  }

  // TODO: Is SC Block->IsMainExit() the same as Block.succ_empty()?
  // Set the register scoreboard.
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      // Now merge the gpr_reg_score information.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        int PredLB = KillWaitBrackets[I]->getScoreLB(T);
        int PredUB = KillWaitBrackets[I]->getScoreUB(T);
        if (PredLB < PredUB) {
          int PredScale = MaxPending[T] - PredUB;
          // Merge vgpr scores.
          for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
            int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
          }
          // Also need to merge sgpr scores for lgkm_cnt.
          if (T == LGKM_CNT) {
            for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
              int PredRegScore =
                  KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
              if (PredRegScore <= PredLB)
                continue;
              int NewRegScore = PredScale + PredRegScore;
              ScoreBrackets->setRegScore(
                  J + NUM_ALL_VGPRS, LGKM_CNT,
                  std::max(
                      ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                      NewRegScore));
            }
          }
        }
      }

      // Also merge the WaitEvent information.
      ForAllWaitEventType(W) {
        enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
        int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
        if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
          int NewEventUB =
              MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
          if (NewEventUB > 0) {
            ScoreBrackets->setEventUB(
                W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
          }
        }
      }
    }
  }

  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
  // sequencing predecessors, because changes to EXEC require waitcnts due to
  // the delayed nature of these operations.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
      break;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    int PredGDSUB = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    if (PredGDSUB > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int NewGDSUB = MaxPending[EXP_CNT] + PredGDSUB -
                     PredScoreBrackets->getScoreUB(EXP_CNT);
      if (NewGDSUB > 0) {
        ScoreBrackets->setEventUB(
            GDS_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), NewGDSUB));
      }
    }
    int PredEXPUB = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    if (PredEXPUB > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int NewEXPUB = MaxPending[EXP_CNT] + PredEXPUB -
                     PredScoreBrackets->getScoreUB(EXP_CNT);
      if (NewEXPUB > 0) {
        ScoreBrackets->setEventUB(
            EXP_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), NewEXPUB));
      }
    }
  }
}

/// Return the "bottom" block of a loop. This differs from
/// MachineLoop::getBottomBlock in that it works even if the loop is
/// discontiguous.
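/// For example (hypothetical numbering): a loop containing blocks #4, #5 and
/// #9, with #6-#8 outside the loop, is discontiguous; its "bottom" is #9.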
MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
  MachineBasicBlock *Bottom = Loop->getHeader();
  for (MachineBasicBlock *MBB : Loop->blocks())
    if (MBB->getNumber() > Bottom->getNumber())
      Bottom = MBB;
  return Bottom;
}

// Generate s_waitcnt instructions where needed.
void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block) {
  // Initialize the state information.
  mergeInputScoreBrackets(Block);

  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();

  DEBUG({
    dbgs() << "Block" << Block.getNumber();
    ScoreBrackets->dump();
  });

  bool InsertNOP = false;

  // Walk over the instructions.
  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;
    // Remove any previously existing waitcnts.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      // TODO: Register the old waitcnt and optimize the following waitcnts.
      // Leaving the previously existing waitcnts is conservatively correct.
      if (CompilerGeneratedWaitcntSet.find(&Inst) ==
          CompilerGeneratedWaitcntSet.end())
        ++Iter;
      else {
        ScoreBrackets->setWaitcnt(&Inst);
        ++Iter;
        Inst.removeFromParent();
      }
      continue;
    }

    // Kill instructions generate a conditional branch to the endmain block.
    // Merge the current waitcnt state into the endmain block information.
    // TODO: Are there other flavors of KILL instruction?
    if (Inst.getOpcode() == AMDGPU::KILL) {
      addKillWaitBracket(ScoreBrackets);
    }

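    // On SI/CI hardware the vccz bit can be stale while a scalar-memory
    // operation that may write vcc is still outstanding (tracked by
    // lgkm_cnt), so a branch reading vccz could observe the wrong value.
    // Detect that situation here; the fix-up below rewrites vcc to itself,
    // which refreshes vccz.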
    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
        (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingSMEM()) {
        if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }

    // Generate an s_waitcnt instruction to be placed before the current
    // instruction, if needed.
    MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);

    if (SWaitInst) {
      Block.insert(Inst, SWaitInst);
      if (ScoreBrackets->getWaitcnt() != SWaitInst) {
        DEBUG(dbgs() << "insertWaitcntInBlock\n"
                     << "Old Instr: " << Inst << '\n'
                     << "New Instr: " << *SWaitInst << '\n';);
      }
    }

    updateEventWaitCntAfter(Inst, ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates an S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0).
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score as if an S_WAITCNT vmcnt(0) were emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    ScoreBrackets->clearWaitcnt();

    if (SWaitInst) {
      DEBUG({ SWaitInst->print(dbgs() << '\n'); });
    }
    DEBUG({
      Inst.print(dbgs());
      ScoreBrackets->dump();
    });

    // Check to see if this is a GWS instruction. If so, and if this is CI or
    // VI, then the generated code sequence will include an S_WAITCNT 0.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
      ScoreBrackets->updateByWait(LGKM_CNT,
                                  ScoreBrackets->getScoreUB(LGKM_CNT));
    }

    // TODO: Remove this work-around after fixing the scheduler and enabling
    // the assert above.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit.  Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {

      // This avoids an s_nop after a waitcnt has just been inserted.
      if (!SWaitInst && InsertNOP) {
        BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
      }
      InsertNOP = false;

      // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
      // or SMEM clause, respectively.
      //
      // The temporary workaround is to break the clauses with S_NOP.
      //
      // The proper solution would be to allocate registers such that all source
      // and destination registers don't overlap, e.g. this is illegal:
      //   r0 = load r2
      //   r2 = load r0
      bool IsSMEM = false;
      bool IsVMEM = false;
      if (TII->isSMRD(Inst))
        IsSMEM = true;
      else if (TII->usesVM_CNT(Inst))
        IsVMEM = true;

      ++Iter;
      if (Iter == E)
        break;

      MachineInstr &Next = *Iter;

      // TODO: How about consecutive SMEM instructions?
      //       The comment above says to break the clause, but the code does not.
      // if ((TII->isSMRD(next) && isSMEM) ||
      if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
          // TODO: Enable this check when hasSoftClause is upstreamed.
          // ST->hasSoftClauses() &&
          ST->isXNACKEnabled()) {
        // Insert a NOP to break the clause.
        InsertNOP = true;
        continue;
      }

      // There must be "S_NOP 0" between an instruction writing M0 and
      // S_SENDMSG.
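      // A hypothetical sequence that triggers this rule:
      //   s_mov_b32 m0, s4          ; writes m0
      //   s_nop 0                   ; required separator
      //   s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)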
      if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
           Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
          Inst.definesRegister(AMDGPU::M0))
        InsertNOP = true;

      continue;
    }

    ++Iter;
  }

  // Check if we need to force convergence at loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't always guarantee convergence for a loop. Each
    // loop should take at most 2 iterations to converge naturally.
    // When this maximum is reached and the result doesn't converge, we force
    // convergence by inserting an s_waitcnt at the end of the loop footer.
    if (WaitcntData->getIterCnt() > 2) {
      // To ensure convergence, the wait events at the loop footer must be no
      // more than those from the previous iteration.
      // As a simplification, instead of tracking individual scores and
      // generating the precise wait count, just wait on 0.
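      // (An S_WAITCNT with immediate 0 encodes vmcnt(0), expcnt(0) and
      // lgkmcnt(0), i.e. it drains all three counters.)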
      bool HasPending = false;
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
        }
      }

      if (HasPending) {
        if (!SWaitInst) {
          SWaitInst = Block.getParent()->CreateMachineInstr(
              TII->get(AMDGPU::S_WAITCNT), DebugLoc());
          CompilerGeneratedWaitcntSet.insert(SWaitInst);
          const MachineOperand &Op = MachineOperand::CreateImm(0);
          SWaitInst->addOperand(MF, Op);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or was
        // created in previous iterations and is added back, since block
        // traversal always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  AMDGPUASI = ST->getAMDGPUAS();

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
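  // For reference (queried per-target above, so treat these as typical values
  // rather than guarantees): vmcnt is a 4-bit field (max 15) before GFX9 and
  // wider on GFX9, expcnt is 3 bits (max 7), and lgkmcnt is 4 bits (max 15).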

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

  // Walk over the blocks in reverse post-order, inserting
  // s_waitcnt where needed.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
           I = RPOT.begin(),
           E = RPOT.end(), J = RPOT.begin();
       I != E;) {
    MachineBasicBlock &MBB = **I;

    BlockVisitedSet.insert(&MBB);

    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    if (!ScoreBrackets) {
      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    }
    ScoreBrackets->setPostOrder(MBB.getNumber());
    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();

    // If we are walking into the block from before the loop, then guarantee
    // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
        (BlockWaitcntProcessedSet.find(&MBB) ==
         BlockWaitcntProcessedSet.end())) {
      BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
      DEBUG(dbgs() << "set-revisit: block"
                   << ContainingLoop->getHeader()->getNumber() << '\n';);
    }

    // Walk over the instructions.
    insertWaitcntInBlock(MF, MBB);

    // Flag that waitcnts have been processed at least once.
    BlockWaitcntProcessedSet.insert(&MBB);

    // See if we want to revisit the loop.
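    // Revisit sketch (hypothetical block numbering): with RPO B0, B1 (loop
    // header), B2, B3 (loop bottom), reaching B3 while B1 has the revisit
    // flag set moves I back to B1, so the loop body is re-processed with the
    // state merged in from the back edge.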
    if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
      if (EntrySB && EntrySB->getRevisitLoop()) {
        EntrySB->setRevisitLoop(false);
        J = I;
        int32_t PostOrder = EntrySB->getPostOrder();
        // TODO: Avoid this loop. Find another way to set I.
        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
                 X = RPOT.begin(),
                 Y = RPOT.end();
             X != Y; ++X) {
          MachineBasicBlock &MBBX = **X;
          if (MBBX.getNumber() == PostOrder) {
            I = X;
            break;
          }
        }
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        WaitcntData->incIterCnt();
        DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
        continue;
      } else {
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // Loop converged; reset the iteration count. If this loop gets
        // revisited, it must be from an outer loop and the counter will
        // restart; this ensures we don't force convergence on such revisits.
        WaitcntData->resetIterCnt();
      }
    }

    J = I;
    ++I;
  }

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed, or else the next
    // wave to reuse the same scratch memory can have its data clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could
    // be improved by looking across blocks for flushes in postdominating
    // blocks from the stores, but an explicitly requested flush is probably
    // very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
           ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt, if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them, and it's better to do the wait after
    // the costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
            TII->get(AMDGPU::S_WAITCNT))
        .addImm(0);

    Modified = true;
  }

  return Modified;
}