1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Insert wait instructions for memory reads and writes.
12 ///
13 /// Memory reads and writes are issued asynchronously, so we need to insert
14 /// S_WAITCNT instructions when we want to access any of their results or
15 /// overwrite any register that's used asynchronously.
16 //
17 //===----------------------------------------------------------------------===//
18 
19 #include "AMDGPU.h"
20 #include "AMDGPUSubtarget.h"
21 #include "SIDefines.h"
22 #include "SIInstrInfo.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "SIRegisterInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/DenseMap.h"
27 #include "llvm/ADT/DenseSet.h"
28 #include "llvm/ADT/PostOrderIterator.h"
29 #include "llvm/ADT/STLExtras.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/CodeGen/MachineBasicBlock.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/MachineFunctionPass.h"
34 #include "llvm/CodeGen/MachineInstr.h"
35 #include "llvm/CodeGen/MachineInstrBuilder.h"
36 #include "llvm/CodeGen/MachineLoopInfo.h"
37 #include "llvm/CodeGen/MachineMemOperand.h"
38 #include "llvm/CodeGen/MachineOperand.h"
39 #include "llvm/CodeGen/MachineRegisterInfo.h"
40 #include "llvm/IR/DebugLoc.h"
41 #include "llvm/Pass.h"
42 #include "llvm/Support/Debug.h"
43 #include "llvm/Support/ErrorHandling.h"
44 #include "llvm/Support/raw_ostream.h"
45 #include <algorithm>
46 #include <cassert>
47 #include <cstdint>
48 #include <cstring>
49 #include <memory>
50 #include <utility>
51 #include <vector>
52 
53 #define DEBUG_TYPE "si-insert-waitcnts"
54 
55 using namespace llvm;
56 
57 namespace {
58 
59 // Class of object that encapsulates latest instruction counter score
60 // associated with the operand.  Used for determining whether
61 // s_waitcnt instruction needs to be emited.
62 
63 #define CNT_MASK(t) (1u << (t))
64 
65 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
66 
67 using RegInterval = std::pair<signed, signed>;
68 
69 struct {
70   int32_t VmcntMax;
71   int32_t ExpcntMax;
72   int32_t LgkmcntMax;
73   int32_t NumVGPRsMax;
74   int32_t NumSGPRsMax;
75 } HardwareLimits;
76 
77 struct {
78   unsigned VGPR0;
79   unsigned VGPRL;
80   unsigned SGPR0;
81   unsigned SGPRL;
82 } RegisterEncoding;
83 
84 enum WaitEventType {
85   VMEM_ACCESS,      // vector-memory read & write
86   LDS_ACCESS,       // lds read & write
87   GDS_ACCESS,       // gds read & write
88   SQ_MESSAGE,       // send message
89   SMEM_ACCESS,      // scalar-memory read & write
90   EXP_GPR_LOCK,     // export holding on its data src
91   GDS_GPR_LOCK,     // GDS holding on its data and addr src
92   EXP_POS_ACCESS,   // write to export position
93   EXP_PARAM_ACCESS, // write to export parameter
94   VMW_GPR_LOCK,     // vector-memory write holding on its data src
95   NUM_WAIT_EVENTS,
96 };
97 
98 // The mapping is:
99 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
100 //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
101 //  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
102 // We reserve a fixed number of VGPR slots in the scoring tables for
103 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
104 enum RegisterMapping {
105   SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
106   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
107   NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
108   EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
109   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
110 };
111 
112 #define ForAllWaitEventType(w)                                                 \
113   for (enum WaitEventType w = (enum WaitEventType)0;                           \
114        (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
115        (w) = (enum WaitEventType)((w) + 1))
116 
117 // This is a per-basic-block object that maintains current score brackets
118 // of each wait counter, and a per-register scoreboard for each wait counter.
119 // We also maintain the latest score for every event type that can change the
120 // waitcnt in order to know if there are multiple types of events within
121 // the brackets. When multiple types of event happen in the bracket,
122 // wait count may get decreased out of order, therefore we need to put in
123 // "s_waitcnt 0" before use.
124 class BlockWaitcntBrackets {
125 public:
126   BlockWaitcntBrackets() {
127     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
128          T = (enum InstCounterType)(T + 1)) {
129       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
130     }
131   }
132 
133   ~BlockWaitcntBrackets() = default;
134 
135   static int32_t getWaitCountMax(InstCounterType T) {
136     switch (T) {
137     case VM_CNT:
138       return HardwareLimits.VmcntMax;
139     case LGKM_CNT:
140       return HardwareLimits.LgkmcntMax;
141     case EXP_CNT:
142       return HardwareLimits.ExpcntMax;
143     default:
144       break;
145     }
146     return 0;
147   }
148 
149   void setScoreLB(InstCounterType T, int32_t Val) {
150     assert(T < NUM_INST_CNTS);
151     if (T >= NUM_INST_CNTS)
152       return;
153     ScoreLBs[T] = Val;
154   }
155 
156   void setScoreUB(InstCounterType T, int32_t Val) {
157     assert(T < NUM_INST_CNTS);
158     if (T >= NUM_INST_CNTS)
159       return;
160     ScoreUBs[T] = Val;
161     if (T == EXP_CNT) {
162       int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
163       if (ScoreLBs[T] < UB)
164         ScoreLBs[T] = UB;
165     }
166   }
167 
168   int32_t getScoreLB(InstCounterType T) {
169     assert(T < NUM_INST_CNTS);
170     if (T >= NUM_INST_CNTS)
171       return 0;
172     return ScoreLBs[T];
173   }
174 
175   int32_t getScoreUB(InstCounterType T) {
176     assert(T < NUM_INST_CNTS);
177     if (T >= NUM_INST_CNTS)
178       return 0;
179     return ScoreUBs[T];
180   }
181 
182   // Mapping from event to counter.
183   InstCounterType eventCounter(WaitEventType E) {
184     switch (E) {
185     case VMEM_ACCESS:
186       return VM_CNT;
187     case LDS_ACCESS:
188     case GDS_ACCESS:
189     case SQ_MESSAGE:
190     case SMEM_ACCESS:
191       return LGKM_CNT;
192     case EXP_GPR_LOCK:
193     case GDS_GPR_LOCK:
194     case VMW_GPR_LOCK:
195     case EXP_POS_ACCESS:
196     case EXP_PARAM_ACCESS:
197       return EXP_CNT;
198     default:
199       llvm_unreachable("unhandled event type");
200     }
201     return NUM_INST_CNTS;
202   }
203 
204   void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
205     if (GprNo < NUM_ALL_VGPRS) {
206       if (GprNo > VgprUB) {
207         VgprUB = GprNo;
208       }
209       VgprScores[T][GprNo] = Val;
210     } else {
211       assert(T == LGKM_CNT);
212       if (GprNo - NUM_ALL_VGPRS > SgprUB) {
213         SgprUB = GprNo - NUM_ALL_VGPRS;
214       }
215       SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
216     }
217   }
218 
219   int32_t getRegScore(int GprNo, InstCounterType T) {
220     if (GprNo < NUM_ALL_VGPRS) {
221       return VgprScores[T][GprNo];
222     }
223     return SgprScores[GprNo - NUM_ALL_VGPRS];
224   }
225 
226   void clear() {
227     memset(ScoreLBs, 0, sizeof(ScoreLBs));
228     memset(ScoreUBs, 0, sizeof(ScoreUBs));
229     memset(EventUBs, 0, sizeof(EventUBs));
230     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
231          T = (enum InstCounterType)(T + 1)) {
232       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
233     }
234     memset(SgprScores, 0, sizeof(SgprScores));
235   }
236 
237   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
238                              const MachineRegisterInfo *MRI,
239                              const SIRegisterInfo *TRI, unsigned OpNo,
240                              bool Def) const;
241 
242   void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
243                    const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
244                    unsigned OpNo, int32_t Val);
245 
246   void setWaitAtBeginning() { WaitAtBeginning = true; }
247   void clearWaitAtBeginning() { WaitAtBeginning = false; }
248   bool getWaitAtBeginning() const { return WaitAtBeginning; }
249   void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
250   int32_t getMaxVGPR() const { return VgprUB; }
251   int32_t getMaxSGPR() const { return SgprUB; }
252 
253   int32_t getEventUB(enum WaitEventType W) const {
254     assert(W < NUM_WAIT_EVENTS);
255     return EventUBs[W];
256   }
257 
258   bool counterOutOfOrder(InstCounterType T);
259   unsigned int updateByWait(InstCounterType T, int ScoreToWait);
260   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
261                      const MachineRegisterInfo *MRI, WaitEventType E,
262                      MachineInstr &MI);
263 
264   bool hasPendingSMEM() const {
265     return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
266             EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
267   }
268 
269   bool hasPendingFlat() const {
270     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
271              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
272             (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
273              LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
274   }
275 
276   void setPendingFlat() {
277     LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
278     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
279   }
280 
281   int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
282 
283   void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
284 
285   bool getRevisitLoop() const { return RevisitLoop; }
286   void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
287 
288   void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
289   int32_t getPostOrder() const { return PostOrder; }
290 
291   void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
292   void clearWaitcnt() { Waitcnt = nullptr; }
293   MachineInstr *getWaitcnt() const { return Waitcnt; }
294 
295   bool mixedExpTypes() const { return MixedExpTypes; }
296   void setMixedExpTypes(bool MixedExpTypesIn) {
297     MixedExpTypes = MixedExpTypesIn;
298   }
299 
300   void print(raw_ostream &);
301   void dump() { print(dbgs()); }
302 
303 private:
304   bool WaitAtBeginning = false;
305   bool RevisitLoop = false;
306   bool MixedExpTypes = false;
307   int32_t PostOrder = 0;
308   MachineInstr *Waitcnt = nullptr;
309   int32_t ScoreLBs[NUM_INST_CNTS] = {0};
310   int32_t ScoreUBs[NUM_INST_CNTS] = {0};
311   int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
312   // Remember the last flat memory operation.
313   int32_t LastFlat[NUM_INST_CNTS] = {0};
314   // wait_cnt scores for every vgpr.
315   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
316   int32_t VgprUB = 0;
317   int32_t SgprUB = 0;
318   int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
319   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
320   int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
321 };
322 
323 // This is a per-loop-region object that records waitcnt status at the end of
324 // loop footer from the previous iteration. We also maintain an iteration
325 // count to track the number of times the loop has been visited. When it
326 // doesn't converge naturally, we force convergence by inserting s_waitcnt 0
327 // at the end of the loop footer.
328 class LoopWaitcntData {
329 public:
330   LoopWaitcntData() = default;
331   ~LoopWaitcntData() = default;
332 
333   void incIterCnt() { IterCnt++; }
334   void resetIterCnt() { IterCnt = 0; }
335   int32_t getIterCnt() { return IterCnt; }
336 
337   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
338   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
339 
340   void print() {
341     DEBUG(dbgs() << "  iteration " << IterCnt << '\n';);
342   }
343 
344 private:
345   // s_waitcnt added at the end of loop footer to stablize wait scores
346   // at the end of the loop footer.
347   MachineInstr *LfWaitcnt = nullptr;
348   // Number of iterations the loop has been visited, not including the initial
349   // walk over.
350   int32_t IterCnt = 0;
351 };
352 
353 class SIInsertWaitcnts : public MachineFunctionPass {
354 private:
355   const SISubtarget *ST = nullptr;
356   const SIInstrInfo *TII = nullptr;
357   const SIRegisterInfo *TRI = nullptr;
358   const MachineRegisterInfo *MRI = nullptr;
359   const MachineLoopInfo *MLI = nullptr;
360   AMDGPU::IsaInfo::IsaVersion IV;
361   AMDGPUAS AMDGPUASI;
362 
363   DenseSet<MachineBasicBlock *> BlockVisitedSet;
364   DenseSet<MachineInstr *> TrackedWaitcntSet;
365   DenseSet<MachineInstr *> VCCZBugHandledSet;
366 
367   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
368       BlockWaitcntBracketsMap;
369 
370   std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
371 
372   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
373 
374   std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
375 
376 public:
377   static char ID;
378 
379   SIInsertWaitcnts() : MachineFunctionPass(ID) {}
380 
381   bool runOnMachineFunction(MachineFunction &MF) override;
382 
383   StringRef getPassName() const override {
384     return "SI insert wait instructions";
385   }
386 
387   void getAnalysisUsage(AnalysisUsage &AU) const override {
388     AU.setPreservesCFG();
389     AU.addRequired<MachineLoopInfo>();
390     MachineFunctionPass::getAnalysisUsage(AU);
391   }
392 
393   void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
394     // The waitcnt information is copied because it changes as the block is
395     // traversed.
396     KillWaitBrackets.push_back(
397         llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
398   }
399 
400   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
401   void generateSWaitCntInstBefore(MachineInstr &MI,
402                                   BlockWaitcntBrackets *ScoreBrackets);
403   void updateEventWaitCntAfter(MachineInstr &Inst,
404                                BlockWaitcntBrackets *ScoreBrackets);
405   void mergeInputScoreBrackets(MachineBasicBlock &Block);
406   bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
407   unsigned countNumBottomBlocks(const MachineLoop *Loop);
408   void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
409   void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
410   bool isWaitcntStronger(unsigned LHS, unsigned RHS);
411   unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
412 };
413 
414 } // end anonymous namespace
415 
416 RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
417                                                  const SIInstrInfo *TII,
418                                                  const MachineRegisterInfo *MRI,
419                                                  const SIRegisterInfo *TRI,
420                                                  unsigned OpNo,
421                                                  bool Def) const {
422   const MachineOperand &Op = MI->getOperand(OpNo);
423   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
424       (Def && !Op.isDef()))
425     return {-1, -1};
426 
427   // A use via a PW operand does not need a waitcnt.
428   // A partial write is not a WAW.
429   assert(!Op.getSubReg() || !Op.isUndef());
430 
431   RegInterval Result;
432   const MachineRegisterInfo &MRIA = *MRI;
433 
434   unsigned Reg = TRI->getEncodingValue(Op.getReg());
435 
436   if (TRI->isVGPR(MRIA, Op.getReg())) {
437     assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
438     Result.first = Reg - RegisterEncoding.VGPR0;
439     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
440   } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
441     assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
442     Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
443     assert(Result.first >= NUM_ALL_VGPRS &&
444            Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
445   }
446   // TODO: Handle TTMP
447   // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
448   else
449     return {-1, -1};
450 
451   const MachineInstr &MIA = *MI;
452   const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
453   unsigned Size = TRI->getRegSizeInBits(*RC);
454   Result.second = Result.first + (Size / 32);
455 
456   return Result;
457 }
458 
459 void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
460                                        const SIInstrInfo *TII,
461                                        const SIRegisterInfo *TRI,
462                                        const MachineRegisterInfo *MRI,
463                                        unsigned OpNo, int32_t Val) {
464   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
465   DEBUG({
466     const MachineOperand &Opnd = MI->getOperand(OpNo);
467     assert(TRI->isVGPR(*MRI, Opnd.getReg()));
468   });
469   for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
470     setRegScore(RegNo, EXP_CNT, Val);
471   }
472 }
473 
474 void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
475                                          const SIRegisterInfo *TRI,
476                                          const MachineRegisterInfo *MRI,
477                                          WaitEventType E, MachineInstr &Inst) {
478   const MachineRegisterInfo &MRIA = *MRI;
479   InstCounterType T = eventCounter(E);
480   int32_t CurrScore = getScoreUB(T) + 1;
481   // EventUB and ScoreUB need to be update regardless if this event changes
482   // the score of a register or not.
483   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
484   EventUBs[E] = CurrScore;
485   setScoreUB(T, CurrScore);
486 
487   if (T == EXP_CNT) {
488     // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
489     // is required.
490     if (!MixedExpTypes) {
491       MixedExpTypes = counterOutOfOrder(EXP_CNT);
492     }
493 
494     // Put score on the source vgprs. If this is a store, just use those
495     // specific register(s).
496     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
497       // All GDS operations must protect their address register (same as
498       // export.)
499       if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
500           Inst.getOpcode() != AMDGPU::DS_CONSUME) {
501         setExpScore(
502             &Inst, TII, TRI, MRI,
503             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
504             CurrScore);
505       }
506       if (Inst.mayStore()) {
507         setExpScore(
508             &Inst, TII, TRI, MRI,
509             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
510             CurrScore);
511         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
512                                        AMDGPU::OpName::data1) != -1) {
513           setExpScore(&Inst, TII, TRI, MRI,
514                       AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
515                                                  AMDGPU::OpName::data1),
516                       CurrScore);
517         }
518       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
519                  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
520                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
521                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
522                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
523                  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
524                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
525                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
526                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
527         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
528           const MachineOperand &Op = Inst.getOperand(I);
529           if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
530             setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
531           }
532         }
533       }
534     } else if (TII->isFLAT(Inst)) {
535       if (Inst.mayStore()) {
536         setExpScore(
537             &Inst, TII, TRI, MRI,
538             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
539             CurrScore);
540       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
541         setExpScore(
542             &Inst, TII, TRI, MRI,
543             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
544             CurrScore);
545       }
546     } else if (TII->isMIMG(Inst)) {
547       if (Inst.mayStore()) {
548         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
549       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
550         setExpScore(
551             &Inst, TII, TRI, MRI,
552             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
553             CurrScore);
554       }
555     } else if (TII->isMTBUF(Inst)) {
556       if (Inst.mayStore()) {
557         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
558       }
559     } else if (TII->isMUBUF(Inst)) {
560       if (Inst.mayStore()) {
561         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
562       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
563         setExpScore(
564             &Inst, TII, TRI, MRI,
565             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
566             CurrScore);
567       }
568     } else {
569       if (TII->isEXP(Inst)) {
570         // For export the destination registers are really temps that
571         // can be used as the actual source after export patching, so
572         // we need to treat them like sources and set the EXP_CNT
573         // score.
574         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
575           MachineOperand &DefMO = Inst.getOperand(I);
576           if (DefMO.isReg() && DefMO.isDef() &&
577               TRI->isVGPR(MRIA, DefMO.getReg())) {
578             setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
579                         CurrScore);
580           }
581         }
582       }
583       for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
584         MachineOperand &MO = Inst.getOperand(I);
585         if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
586           setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
587         }
588       }
589     }
590 #if 0 // TODO: check if this is handled by MUBUF code above.
591   } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
592        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
593        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
594     MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
595     unsigned OpNo;//TODO: find the OpNo for this operand;
596     RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
597     for (signed RegNo = Interval.first; RegNo < Interval.second;
598     ++RegNo) {
599       setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
600     }
601 #endif
602   } else {
603     // Match the score to the destination registers.
604     for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
605       RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
606       if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
607         continue;
608       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
609         setRegScore(RegNo, T, CurrScore);
610       }
611     }
612     if (TII->isDS(Inst) && Inst.mayStore()) {
613       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
614     }
615   }
616 }
617 
618 void BlockWaitcntBrackets::print(raw_ostream &OS) {
619   OS << '\n';
620   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
621        T = (enum InstCounterType)(T + 1)) {
622     int LB = getScoreLB(T);
623     int UB = getScoreUB(T);
624 
625     switch (T) {
626     case VM_CNT:
627       OS << "    VM_CNT(" << UB - LB << "): ";
628       break;
629     case LGKM_CNT:
630       OS << "    LGKM_CNT(" << UB - LB << "): ";
631       break;
632     case EXP_CNT:
633       OS << "    EXP_CNT(" << UB - LB << "): ";
634       break;
635     default:
636       OS << "    UNKNOWN(" << UB - LB << "): ";
637       break;
638     }
639 
640     if (LB < UB) {
641       // Print vgpr scores.
642       for (int J = 0; J <= getMaxVGPR(); J++) {
643         int RegScore = getRegScore(J, T);
644         if (RegScore <= LB)
645           continue;
646         int RelScore = RegScore - LB - 1;
647         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
648           OS << RelScore << ":v" << J << " ";
649         } else {
650           OS << RelScore << ":ds ";
651         }
652       }
653       // Also need to print sgpr scores for lgkm_cnt.
654       if (T == LGKM_CNT) {
655         for (int J = 0; J <= getMaxSGPR(); J++) {
656           int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
657           if (RegScore <= LB)
658             continue;
659           int RelScore = RegScore - LB - 1;
660           OS << RelScore << ":s" << J << " ";
661         }
662       }
663     }
664     OS << '\n';
665   }
666   OS << '\n';
667 }
668 
669 unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
670                                                 int ScoreToWait) {
671   unsigned int NeedWait = 0;
672   if (ScoreToWait == -1) {
673     // The score to wait is unknown. This implies that it was not encountered
674     // during the path of the CFG walk done during the current traversal but
675     // may be seen on a different path. Emit an s_wait counter with a
676     // conservative value of 0 for the counter.
677     NeedWait = CNT_MASK(T);
678     setScoreLB(T, getScoreUB(T));
679     return NeedWait;
680   }
681 
682   // If the score of src_operand falls within the bracket, we need an
683   // s_waitcnt instruction.
684   const int32_t LB = getScoreLB(T);
685   const int32_t UB = getScoreUB(T);
686   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
687     if (T == VM_CNT && hasPendingFlat()) {
688       // If there is a pending FLAT operation, and this is a VM waitcnt,
689       // then we need to force a waitcnt 0 for VM.
690       NeedWait = CNT_MASK(T);
691       setScoreLB(T, getScoreUB(T));
692     } else if (counterOutOfOrder(T)) {
693       // Counter can get decremented out-of-order when there
694       // are multiple types event in the bracket. Also emit an s_wait counter
695       // with a conservative value of 0 for the counter.
696       NeedWait = CNT_MASK(T);
697       setScoreLB(T, getScoreUB(T));
698     } else {
699       NeedWait = CNT_MASK(T);
700       setScoreLB(T, ScoreToWait);
701     }
702   }
703 
704   return NeedWait;
705 }
706 
707 // Where there are multiple types of event in the bracket of a counter,
708 // the decrement may go out of order.
709 bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
710   switch (T) {
711   case VM_CNT:
712     return false;
713   case LGKM_CNT: {
714     if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
715         EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
716       // Scalar memory read always can go out of order.
717       return true;
718     }
719     int NumEventTypes = 0;
720     if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
721         EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
722       NumEventTypes++;
723     }
724     if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
725         EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
726       NumEventTypes++;
727     }
728     if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
729         EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
730       NumEventTypes++;
731     }
732     if (NumEventTypes <= 1) {
733       return false;
734     }
735     break;
736   }
737   case EXP_CNT: {
738     // If there has been a mixture of export types, then a waitcnt exp(0) is
739     // required.
740     if (MixedExpTypes)
741       return true;
742     int NumEventTypes = 0;
743     if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
744         EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
745       NumEventTypes++;
746     }
747     if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
748         EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
749       NumEventTypes++;
750     }
751     if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
752         EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
753       NumEventTypes++;
754     }
755     if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
756         EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
757       NumEventTypes++;
758     }
759 
760     if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
761         EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
762       NumEventTypes++;
763     }
764 
765     if (NumEventTypes <= 1) {
766       return false;
767     }
768     break;
769   }
770   default:
771     break;
772   }
773   return true;
774 }
775 
776 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
777                       false)
778 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
779                     false)
780 
781 char SIInsertWaitcnts::ID = 0;
782 
783 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
784 
785 FunctionPass *llvm::createSIInsertWaitcntsPass() {
786   return new SIInsertWaitcnts();
787 }
788 
789 static bool readsVCCZ(const MachineInstr &MI) {
790   unsigned Opc = MI.getOpcode();
791   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
792          !MI.getOperand(1).isUndef();
793 }
794 
795 /// \brief Given wait count encodings checks if LHS is stronger than RHS.
796 bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
797   if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
798     return false;
799   if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
800     return false;
801   if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
802     return false;
803   return true;
804 }
805 
806 /// \brief Given wait count encodings create a new encoding which is stronger
807 /// or equal to both.
808 unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
809   unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
810                             AMDGPU::decodeVmcnt(IV, RHS));
811   unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
812                               AMDGPU::decodeLgkmcnt(IV, RHS));
813   unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
814                              AMDGPU::decodeExpcnt(IV, RHS));
815   return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
816 }
817 
818 ///  \brief Generate s_waitcnt instruction to be placed before cur_Inst.
819 ///  Instructions of a given type are returned in order,
820 ///  but instructions of different types can complete out of order.
821 ///  We rely on this in-order completion
822 ///  and simply assign a score to the memory access instructions.
823 ///  We keep track of the active "score bracket" to determine
824 ///  if an access of a memory read requires an s_waitcnt
825 ///  and if so what the value of each counter is.
826 ///  The "score bracket" is bound by the lower bound and upper bound
827 ///  scores (*_score_LB and *_score_ub respectively).
828 void SIInsertWaitcnts::generateSWaitCntInstBefore(
829     MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
830   // To emit, or not to emit - that's the question!
831   // Start with an assumption that there is no need to emit.
832   unsigned int EmitSwaitcnt = 0;
833   // No need to wait before phi. If a phi-move exists, then the wait should
834   // has been inserted before the move. If a phi-move does not exist, then
835   // wait should be inserted before the real use. The same is true for
836   // sc-merge. It is not a coincident that all these cases correspond to the
837   // instructions that are skipped in the assembling loop.
838   bool NeedLineMapping = false; // TODO: Check on this.
839   if (MI.isDebugValue() &&
840       // TODO: any other opcode?
841       !NeedLineMapping) {
842     return;
843   }
844 
845   // See if an s_waitcnt is forced at block entry, or is needed at
846   // program end.
847   if (ScoreBrackets->getWaitAtBeginning()) {
848     // Note that we have already cleared the state, so we don't need to update
849     // it.
850     ScoreBrackets->clearWaitAtBeginning();
851     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
852          T = (enum InstCounterType)(T + 1)) {
853       EmitSwaitcnt |= CNT_MASK(T);
854       ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
855     }
856   }
857 
858   // See if this instruction has a forced S_WAITCNT VM.
859   // TODO: Handle other cases of NeedsWaitcntVmBefore()
860   else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
861            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
862            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
863     EmitSwaitcnt |=
864         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
865   }
866 
867   // All waits must be resolved at call return.
868   // NOTE: this could be improved with knowledge of all call sites or
869   //   with knowledge of the called routines.
870   if (MI.getOpcode() == AMDGPU::RETURN ||
871       MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
872       MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
873     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
874          T = (enum InstCounterType)(T + 1)) {
875       if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
876         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
877         EmitSwaitcnt |= CNT_MASK(T);
878       }
879     }
880   }
881   // Resolve vm waits before gs-done.
882   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
883             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
884            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
885             AMDGPU::SendMsg::ID_GS_DONE)) {
886     if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
887       ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
888       EmitSwaitcnt |= CNT_MASK(VM_CNT);
889     }
890   }
891 #if 0 // TODO: the following blocks of logic when we have fence.
892   else if (MI.getOpcode() == SC_FENCE) {
893     const unsigned int group_size =
894       context->shader_info->GetMaxThreadGroupSize();
895     // group_size == 0 means thread group size is unknown at compile time
896     const bool group_is_multi_wave =
897       (group_size == 0 || group_size > target_info->GetWaveFrontSize());
898     const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
899 
900     for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
901       SCRegType src_type = Inst->GetSrcType(i);
902       switch (src_type) {
903         case SCMEM_LDS:
904           if (group_is_multi_wave ||
905             context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
906             EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
907                                ScoreBrackets->getScoreUB(LGKM_CNT));
908             // LDS may have to wait for VM_CNT after buffer load to LDS
909             if (target_info->HasBufferLoadToLDS()) {
910               EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
911                                  ScoreBrackets->getScoreUB(VM_CNT));
912             }
913           }
914           break;
915 
916         case SCMEM_GDS:
917           if (group_is_multi_wave || fence_is_global) {
918             EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
919               ScoreBrackets->getScoreUB(EXP_CNT));
920             EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
921               ScoreBrackets->getScoreUB(LGKM_CNT));
922           }
923           break;
924 
925         case SCMEM_UAV:
926         case SCMEM_TFBUF:
927         case SCMEM_RING:
928         case SCMEM_SCATTER:
929           if (group_is_multi_wave || fence_is_global) {
930             EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
931               ScoreBrackets->getScoreUB(EXP_CNT));
932             EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
933               ScoreBrackets->getScoreUB(VM_CNT));
934           }
935           break;
936 
937         case SCMEM_SCRATCH:
938         default:
939           break;
940       }
941     }
942   }
943 #endif
944 
945   // Export & GDS instructions do not read the EXEC mask until after the export
946   // is granted (which can occur well after the instruction is issued).
947   // The shader program must flush all EXP operations on the export-count
948   // before overwriting the EXEC mask.
949   else {
950     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
951       // Export and GDS are tracked individually, either may trigger a waitcnt
952       // for EXEC.
953       EmitSwaitcnt |= ScoreBrackets->updateByWait(
954           EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
955       EmitSwaitcnt |= ScoreBrackets->updateByWait(
956           EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
957       EmitSwaitcnt |= ScoreBrackets->updateByWait(
958           EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
959       EmitSwaitcnt |= ScoreBrackets->updateByWait(
960           EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
961     }
962 
963 #if 0 // TODO: the following code to handle CALL.
964     // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
965     // However, there is a problem with EXP_CNT, because the call cannot
966     // easily tell if a register is used in the function, and if it did, then
967     // the referring instruction would have to have an S_WAITCNT, which is
968     // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
969     // before the call.
970     if (MI.getOpcode() == SC_CALL) {
971       if (ScoreBrackets->getScoreUB(EXP_CNT) >
972         ScoreBrackets->getScoreLB(EXP_CNT)) {
973         ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
974         EmitSwaitcnt |= CNT_MASK(EXP_CNT);
975       }
976     }
977 #endif
978 
979     // FIXME: Should not be relying on memoperands.
980     // Look at the source operands of every instruction to see if
981     // any of them results from a previous memory operation that affects
982     // its current usage. If so, an s_waitcnt instruction needs to be
983     // emitted.
984     // If the source operand was defined by a load, add the s_waitcnt
985     // instruction.
986     for (const MachineMemOperand *Memop : MI.memoperands()) {
987       unsigned AS = Memop->getAddrSpace();
988       if (AS != AMDGPUASI.LOCAL_ADDRESS)
989         continue;
990       unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
991       // VM_CNT is only relevant to vgpr or LDS.
992       EmitSwaitcnt |= ScoreBrackets->updateByWait(
993           VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
994     }
995 
996     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
997       const MachineOperand &Op = MI.getOperand(I);
998       const MachineRegisterInfo &MRIA = *MRI;
999       RegInterval Interval =
1000           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
1001       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1002         if (TRI->isVGPR(MRIA, Op.getReg())) {
1003           // VM_CNT is only relevant to vgpr or LDS.
1004           EmitSwaitcnt |= ScoreBrackets->updateByWait(
1005               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1006         }
1007         EmitSwaitcnt |= ScoreBrackets->updateByWait(
1008             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1009       }
1010     }
1011     // End of for loop that looks at all source operands to decide vm_wait_cnt
1012     // and lgk_wait_cnt.
1013 
1014     // Two cases are handled for destination operands:
1015     // 1) If the destination operand was defined by a load, add the s_waitcnt
1016     // instruction to guarantee the right WAW order.
1017     // 2) If a destination operand that was used by a recent export/store ins,
1018     // add s_waitcnt on exp_cnt to guarantee the WAR order.
1019     if (MI.mayStore()) {
1020       // FIXME: Should not be relying on memoperands.
1021       for (const MachineMemOperand *Memop : MI.memoperands()) {
1022         unsigned AS = Memop->getAddrSpace();
1023         if (AS != AMDGPUASI.LOCAL_ADDRESS)
1024           continue;
1025         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1026         EmitSwaitcnt |= ScoreBrackets->updateByWait(
1027             VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1028         EmitSwaitcnt |= ScoreBrackets->updateByWait(
1029             EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1030       }
1031     }
1032     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1033       MachineOperand &Def = MI.getOperand(I);
1034       const MachineRegisterInfo &MRIA = *MRI;
1035       RegInterval Interval =
1036           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
1037       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1038         if (TRI->isVGPR(MRIA, Def.getReg())) {
1039           EmitSwaitcnt |= ScoreBrackets->updateByWait(
1040               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1041           EmitSwaitcnt |= ScoreBrackets->updateByWait(
1042               EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1043         }
1044         EmitSwaitcnt |= ScoreBrackets->updateByWait(
1045             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1046       }
1047     } // End of for loop that looks at all dest operands.
1048   }
1049 
1050   // TODO: Tie force zero to a compiler triage option.
1051   bool ForceZero = false;
1052 
1053   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1054   // occurs before the instruction. Doing it here prevents any additional
1055   // S_WAITCNTs from being emitted if the instruction was marked as
1056   // requiring a WAITCNT beforehand.
1057   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1058       !ST->hasAutoWaitcntBeforeBarrier()) {
1059     EmitSwaitcnt |=
1060         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1061     EmitSwaitcnt |= ScoreBrackets->updateByWait(
1062         EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1063     EmitSwaitcnt |= ScoreBrackets->updateByWait(
1064         LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
1065   }
1066 
1067   // TODO: Remove this work-around, enable the assert for Bug 457939
1068   //       after fixing the scheduler. Also, the Shader Compiler code is
1069   //       independent of target.
1070   if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
1071     if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1072             ScoreBrackets->getScoreUB(LGKM_CNT) &&
1073         ScoreBrackets->hasPendingSMEM()) {
1074       // Wait on everything, not just LGKM.  vccz reads usually come from
1075       // terminators, and we always wait on everything at the end of the
1076       // block, so if we only wait on LGKM here, we might end up with
1077       // another s_waitcnt inserted right after this if there are non-LGKM
1078       // instructions still outstanding.
1079       ForceZero = true;
1080       EmitSwaitcnt = true;
1081     }
1082   }
1083 
1084   // Does this operand processing indicate s_wait counter update?
1085   if (EmitSwaitcnt) {
1086     int CntVal[NUM_INST_CNTS];
1087 
1088     bool UseDefaultWaitcntStrategy = true;
1089     if (ForceZero) {
1090       // Force all waitcnts to 0.
1091       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1092            T = (enum InstCounterType)(T + 1)) {
1093         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1094       }
1095       CntVal[VM_CNT] = 0;
1096       CntVal[EXP_CNT] = 0;
1097       CntVal[LGKM_CNT] = 0;
1098       UseDefaultWaitcntStrategy = false;
1099     }
1100 
1101     if (UseDefaultWaitcntStrategy) {
1102       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1103            T = (enum InstCounterType)(T + 1)) {
1104         if (EmitSwaitcnt & CNT_MASK(T)) {
1105           int Delta =
1106               ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
1107           int MaxDelta = ScoreBrackets->getWaitCountMax(T);
1108           if (Delta >= MaxDelta) {
1109             Delta = -1;
1110             if (T != EXP_CNT) {
1111               ScoreBrackets->setScoreLB(
1112                   T, ScoreBrackets->getScoreUB(T) - MaxDelta);
1113             }
1114             EmitSwaitcnt &= ~CNT_MASK(T);
1115           }
1116           CntVal[T] = Delta;
1117         } else {
1118           // If we are not waiting for a particular counter then encode
1119           // it as -1 which means "don't care."
1120           CntVal[T] = -1;
1121         }
1122       }
1123     }
1124 
1125     // If we are not waiting on any counter we can skip the wait altogether.
1126     if (EmitSwaitcnt != 0) {
1127       MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
1128       int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
1129       if (!OldWaitcnt ||
1130           (AMDGPU::decodeVmcnt(IV, Imm) !=
1131                           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
1132           (AMDGPU::decodeExpcnt(IV, Imm) !=
1133            (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
1134           (AMDGPU::decodeLgkmcnt(IV, Imm) !=
1135            (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
1136         MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
1137         if (ContainingLoop) {
1138           MachineBasicBlock *TBB = ContainingLoop->getHeader();
1139           BlockWaitcntBrackets *ScoreBracket =
1140               BlockWaitcntBracketsMap[TBB].get();
1141           if (!ScoreBracket) {
1142             assert(!BlockVisitedSet.count(TBB));
1143             BlockWaitcntBracketsMap[TBB] =
1144                 llvm::make_unique<BlockWaitcntBrackets>();
1145             ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
1146           }
1147           ScoreBracket->setRevisitLoop(true);
1148           DEBUG(dbgs() << "set-revisit: Block"
1149                        << ContainingLoop->getHeader()->getNumber() << '\n';);
1150         }
1151       }
1152 
1153       // Update an existing waitcount, or make a new one.
1154       unsigned Enc = AMDGPU::encodeWaitcnt(IV, CntVal[VM_CNT],
1155                                            CntVal[EXP_CNT], CntVal[LGKM_CNT]);
1156       // We don't remove waitcnts that existed prior to the waitcnt
1157       // pass. Check if the waitcnt to-be-inserted can be avoided
1158       // or if the prev waitcnt can be updated.
1159       bool insertSWaitInst = true;
1160       for (MachineBasicBlock::iterator I = MI.getIterator(),
1161                                        B = MI.getParent()->begin();
1162            insertSWaitInst && I != B; --I) {
1163         if (I == MI.getIterator())
1164           continue;
1165 
1166         switch (I->getOpcode()) {
1167         case AMDGPU::S_WAITCNT:
1168           if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
1169             insertSWaitInst = false;
1170           else if (!OldWaitcnt) {
1171             OldWaitcnt = &*I;
1172             Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
1173           }
1174           break;
1175         // TODO: skip over instructions which never require wait.
1176         }
1177         break;
1178       }
1179       if (insertSWaitInst) {
1180         if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
1181           OldWaitcnt->getOperand(0).setImm(Enc);
1182           if (!OldWaitcnt->getParent())
1183             MI.getParent()->insert(MI, OldWaitcnt);
1184 
1185           DEBUG(dbgs() << "updateWaitcntInBlock\n"
1186                        << "Old Instr: " << MI << '\n'
1187                        << "New Instr: " << *OldWaitcnt << '\n');
1188         } else {
1189             auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1190                                MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1191                              .addImm(Enc);
1192             TrackedWaitcntSet.insert(SWaitInst);
1193 
1194             DEBUG(dbgs() << "insertWaitcntInBlock\n"
1195                          << "Old Instr: " << MI << '\n'
1196                          << "New Instr: " << *SWaitInst << '\n');
1197         }
1198       }
1199 
1200       if (CntVal[EXP_CNT] == 0) {
1201         ScoreBrackets->setMixedExpTypes(false);
1202       }
1203     }
1204   }
1205 }
1206 
1207 void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
1208                                              MachineInstr *Waitcnt) {
1209   if (MBB.empty()) {
1210     MBB.push_back(Waitcnt);
1211     return;
1212   }
1213 
1214   MachineBasicBlock::iterator It = MBB.end();
1215   MachineInstr *MI = &*(--It);
1216   if (MI->isBranch()) {
1217     MBB.insert(It, Waitcnt);
1218   } else {
1219     MBB.push_back(Waitcnt);
1220   }
1221 }
1222 
1223 // This is a flat memory operation. Check to see if it has memory
1224 // tokens for both LDS and Memory, and if so mark it as a flat.
1225 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1226   if (MI.memoperands_empty())
1227     return true;
1228 
1229   for (const MachineMemOperand *Memop : MI.memoperands()) {
1230     unsigned AS = Memop->getAddrSpace();
1231     if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
1232       return true;
1233   }
1234 
1235   return false;
1236 }
1237 
1238 void SIInsertWaitcnts::updateEventWaitCntAfter(
1239     MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
1240   // Now look at the instruction opcode. If it is a memory access
1241   // instruction, update the upper-bound of the appropriate counter's
1242   // bracket and the destination operand scores.
1243   // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1244   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1245     if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1246       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1247       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1248     } else {
1249       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1250     }
1251   } else if (TII->isFLAT(Inst)) {
1252     assert(Inst.mayLoad() || Inst.mayStore());
1253 
1254     if (TII->usesVM_CNT(Inst))
1255       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1256 
1257     if (TII->usesLGKM_CNT(Inst)) {
1258       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1259 
1260       // This is a flat memory operation, so note it - it will require
1261       // that both the VM and LGKM be flushed to zero if it is pending when
1262       // a VM or LGKM dependency occurs.
1263       if (mayAccessLDSThroughFlat(Inst))
1264         ScoreBrackets->setPendingFlat();
1265     }
1266   } else if (SIInstrInfo::isVMEM(Inst) &&
1267              // TODO: get a better carve out.
1268              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1269              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1270              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1271     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1272     if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
1273         (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1274       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1275     }
1276   } else if (TII->isSMRD(Inst)) {
1277     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1278   } else {
1279     switch (Inst.getOpcode()) {
1280     case AMDGPU::S_SENDMSG:
1281     case AMDGPU::S_SENDMSGHALT:
1282       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1283       break;
1284     case AMDGPU::EXP:
1285     case AMDGPU::EXP_DONE: {
1286       int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1287       if (Imm >= 32 && Imm <= 63)
1288         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1289       else if (Imm >= 12 && Imm <= 15)
1290         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1291       else
1292         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1293       break;
1294     }
1295     case AMDGPU::S_MEMTIME:
1296     case AMDGPU::S_MEMREALTIME:
1297       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1298       break;
1299     default:
1300       break;
1301     }
1302   }
1303 }
1304 
1305 // Merge the score brackets of the Block's predecessors;
1306 // this merged score bracket is used when adding waitcnts to the Block
1307 void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1308   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1309   int32_t MaxPending[NUM_INST_CNTS] = {0};
1310   int32_t MaxFlat[NUM_INST_CNTS] = {0};
1311   bool MixedExpTypes = false;
1312 
1313   // For single basic block loops, we need to retain the Block's
1314   // score bracket to have accurate Pred info. So, make a copy of Block's
1315   // score bracket, clear() it (which retains several important bits of info),
1316   // populate, and then replace en masse. For non-single basic block loops,
1317   // just clear Block's current score bracket and repopulate in-place.
1318   bool IsSelfPred;
1319   std::unique_ptr<BlockWaitcntBrackets> S;
1320 
1321   IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
1322     != Block.pred_end();
1323   if (IsSelfPred) {
1324     S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1325     ScoreBrackets = S.get();
1326   }
1327 
1328   ScoreBrackets->clear();
1329 
1330   // See if there are any uninitialized predecessors. If so, emit an
1331   // s_waitcnt 0 at the beginning of the block.
1332   for (MachineBasicBlock *Pred : Block.predecessors()) {
1333     BlockWaitcntBrackets *PredScoreBrackets =
1334         BlockWaitcntBracketsMap[Pred].get();
1335     bool Visited = BlockVisitedSet.count(Pred);
1336     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1337       continue;
1338     }
1339     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1340          T = (enum InstCounterType)(T + 1)) {
1341       int span =
1342           PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1343       MaxPending[T] = std::max(MaxPending[T], span);
1344       span =
1345           PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1346       MaxFlat[T] = std::max(MaxFlat[T], span);
1347     }
1348 
1349     MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
1350   }
1351 
1352   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1353   // Also handle kills for exit block.
1354   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1355     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1356       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1357            T = (enum InstCounterType)(T + 1)) {
1358         int Span = KillWaitBrackets[I]->getScoreUB(T) -
1359                    KillWaitBrackets[I]->getScoreLB(T);
1360         MaxPending[T] = std::max(MaxPending[T], Span);
1361         Span = KillWaitBrackets[I]->pendingFlat(T) -
1362                KillWaitBrackets[I]->getScoreLB(T);
1363         MaxFlat[T] = std::max(MaxFlat[T], Span);
1364       }
1365 
1366       MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
1367     }
1368   }
1369 
1370   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
1371   for (MachineBasicBlock *Pred : Block.predecessors()) {
1372     BlockWaitcntBrackets *PredScoreBrackets =
1373         BlockWaitcntBracketsMap[Pred].get();
1374     bool Visited = BlockVisitedSet.count(Pred);
1375     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1376       continue;
1377     }
1378 
1379     int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
1380                   PredScoreBrackets->getScoreLB(EXP_CNT);
1381     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1382     int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
1383                   PredScoreBrackets->getScoreLB(EXP_CNT);
1384     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1385   }
1386 
1387   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1388   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1389     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1390       int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
1391                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1392       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1393       int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
1394                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1395       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1396     }
1397   }
1398 
1399 #if 0
1400   // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
1401   // TODO: how does LC distinguish between function entry and main entry?
1402   // If this is the entry to a function, force a wait.
1403   MachineBasicBlock &Entry = Block.getParent()->front();
1404   if (Entry.getNumber() == Block.getNumber()) {
1405     ScoreBrackets->setWaitAtBeginning();
1406     return;
1407   }
1408 #endif
1409 
1410   // Now set the current Block's brackets to the largest ending bracket.
1411   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1412        T = (enum InstCounterType)(T + 1)) {
1413     ScoreBrackets->setScoreUB(T, MaxPending[T]);
1414     ScoreBrackets->setScoreLB(T, 0);
1415     ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1416   }
1417 
1418   ScoreBrackets->setMixedExpTypes(MixedExpTypes);
1419 
1420   // Set the register scoreboard.
1421   for (MachineBasicBlock *Pred : Block.predecessors()) {
1422     if (!BlockVisitedSet.count(Pred)) {
1423       continue;
1424     }
1425 
1426     BlockWaitcntBrackets *PredScoreBrackets =
1427         BlockWaitcntBracketsMap[Pred].get();
1428 
1429     // Now merge the gpr_reg_score information
1430     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1431          T = (enum InstCounterType)(T + 1)) {
1432       int PredLB = PredScoreBrackets->getScoreLB(T);
1433       int PredUB = PredScoreBrackets->getScoreUB(T);
1434       if (PredLB < PredUB) {
1435         int PredScale = MaxPending[T] - PredUB;
1436         // Merge vgpr scores.
1437         for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1438           int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1439           if (PredRegScore <= PredLB)
1440             continue;
1441           int NewRegScore = PredScale + PredRegScore;
1442           ScoreBrackets->setRegScore(
1443               J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1444         }
1445         // Also need to merge sgpr scores for lgkm_cnt.
1446         if (T == LGKM_CNT) {
1447           for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1448             int PredRegScore =
1449                 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1450             if (PredRegScore <= PredLB)
1451               continue;
1452             int NewRegScore = PredScale + PredRegScore;
1453             ScoreBrackets->setRegScore(
1454                 J + NUM_ALL_VGPRS, LGKM_CNT,
1455                 std::max(
1456                     ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1457                     NewRegScore));
1458           }
1459         }
1460       }
1461     }
1462 
1463     // Also merge the WaitEvent information.
1464     ForAllWaitEventType(W) {
1465       enum InstCounterType T = PredScoreBrackets->eventCounter(W);
1466       int PredEventUB = PredScoreBrackets->getEventUB(W);
1467       if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
1468         int NewEventUB =
1469             MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
1470         if (NewEventUB > 0) {
1471           ScoreBrackets->setEventUB(
1472               W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1473         }
1474       }
1475     }
1476   }
1477 
1478   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1479   // Set the register scoreboard.
1480   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1481     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1482       // Now merge the gpr_reg_score information.
1483       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1484            T = (enum InstCounterType)(T + 1)) {
1485         int PredLB = KillWaitBrackets[I]->getScoreLB(T);
1486         int PredUB = KillWaitBrackets[I]->getScoreUB(T);
1487         if (PredLB < PredUB) {
1488           int PredScale = MaxPending[T] - PredUB;
1489           // Merge vgpr scores.
1490           for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
1491             int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
1492             if (PredRegScore <= PredLB)
1493               continue;
1494             int NewRegScore = PredScale + PredRegScore;
1495             ScoreBrackets->setRegScore(
1496                 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1497           }
1498           // Also need to merge sgpr scores for lgkm_cnt.
1499           if (T == LGKM_CNT) {
1500             for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
1501               int PredRegScore =
1502                   KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1503               if (PredRegScore <= PredLB)
1504                 continue;
1505               int NewRegScore = PredScale + PredRegScore;
1506               ScoreBrackets->setRegScore(
1507                   J + NUM_ALL_VGPRS, LGKM_CNT,
1508                   std::max(
1509                       ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1510                       NewRegScore));
1511             }
1512           }
1513         }
1514       }
1515 
1516       // Also merge the WaitEvent information.
1517       ForAllWaitEventType(W) {
1518         enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
1519         int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
1520         if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
1521           int NewEventUB =
1522               MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
1523           if (NewEventUB > 0) {
1524             ScoreBrackets->setEventUB(
1525                 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1526           }
1527         }
1528       }
1529     }
1530   }
1531 
1532   // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
1533   // sequencing predecessors, because changes to EXEC require waitcnts due to
1534   // the delayed nature of these operations.
1535   for (MachineBasicBlock *Pred : Block.predecessors()) {
1536     if (!BlockVisitedSet.count(Pred)) {
1537       continue;
1538     }
1539 
1540     BlockWaitcntBrackets *PredScoreBrackets =
1541         BlockWaitcntBracketsMap[Pred].get();
1542 
1543     int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
1544     if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1545       int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
1546                        PredScoreBrackets->getScoreUB(EXP_CNT);
1547       if (new_gds_ub > 0) {
1548         ScoreBrackets->setEventUB(
1549             GDS_GPR_LOCK,
1550             std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
1551       }
1552     }
1553     int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
1554     if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1555       int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
1556                        PredScoreBrackets->getScoreUB(EXP_CNT);
1557       if (new_exp_ub > 0) {
1558         ScoreBrackets->setEventUB(
1559             EXP_GPR_LOCK,
1560             std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
1561       }
1562     }
1563   }
1564 
1565   // if a single block loop, update the score brackets. Not needed for other
1566   // blocks, as we did this in-place
1567   if (IsSelfPred) {
1568     BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1569   }
1570 }
1571 
1572 /// Return true if the given basic block is a "bottom" block of a loop. This
1573 /// differs from MachineLoop::getBottomBlock in that it works even if the loop
1574 /// is discontiguous. This also handles multiple back-edges for the same
1575 /// "header" block of a loop.
1576 bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1577                                     const MachineBasicBlock *Block) {
1578   for (MachineBasicBlock *MBB : Loop->blocks()) {
1579     if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1580       return true;
1581     }
1582   }
1583   return false;
1584 }
1585 
1586 /// Count the number of "bottom" basic blocks of a loop.
1587 unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1588   unsigned Count = 0;
1589   for (MachineBasicBlock *MBB : Loop->blocks()) {
1590     if (MBB->isSuccessor(Loop->getHeader())) {
1591       Count++;
1592     }
1593   }
1594   return Count;
1595 }
1596 
1597 // Generate s_waitcnt instructions where needed.
1598 void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1599                                             MachineBasicBlock &Block) {
1600   // Initialize the state information.
1601   mergeInputScoreBrackets(Block);
1602 
1603   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1604 
1605   DEBUG({
1606     dbgs() << "Block" << Block.getNumber();
1607     ScoreBrackets->dump();
1608   });
1609 
1610   // Walk over the instructions.
1611   for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1612        Iter != E;) {
1613     MachineInstr &Inst = *Iter;
1614     // Remove any previously existing waitcnts.
1615     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
1616       // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
1617       // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
1618       // as needed.
1619       if (!TrackedWaitcntSet.count(&Inst))
1620         ++Iter;
1621       else {
1622         ++Iter;
1623         Inst.removeFromParent();
1624       }
1625       ScoreBrackets->setWaitcnt(&Inst);
1626       continue;
1627     }
1628 
1629     // Kill instructions generate a conditional branch to the endmain block.
1630     // Merge the current waitcnt state into the endmain block information.
1631     // TODO: Are there other flavors of KILL instruction?
1632     if (Inst.getOpcode() == AMDGPU::KILL) {
1633       addKillWaitBracket(ScoreBrackets);
1634     }
1635 
1636     bool VCCZBugWorkAround = false;
1637     if (readsVCCZ(Inst) &&
1638         (!VCCZBugHandledSet.count(&Inst))) {
1639       if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1640               ScoreBrackets->getScoreUB(LGKM_CNT) &&
1641           ScoreBrackets->hasPendingSMEM()) {
1642         if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
1643           VCCZBugWorkAround = true;
1644       }
1645     }
1646 
1647     // Generate an s_waitcnt instruction to be placed before
1648     // cur_Inst, if needed.
1649     generateSWaitCntInstBefore(Inst, ScoreBrackets);
1650 
1651     updateEventWaitCntAfter(Inst, ScoreBrackets);
1652 
1653 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1654     // If this instruction generates a S_SETVSKIP because it is an
1655     // indexed resource, and we are on Tahiti, then it will also force
1656     // an S_WAITCNT vmcnt(0)
1657     if (RequireCheckResourceType(Inst, context)) {
1658       // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1659       ScoreBrackets->setScoreLB(VM_CNT,
1660       ScoreBrackets->getScoreUB(VM_CNT));
1661     }
1662 #endif
1663 
1664     ScoreBrackets->clearWaitcnt();
1665 
1666     DEBUG({
1667       Inst.print(dbgs());
1668       ScoreBrackets->dump();
1669     });
1670 
1671     // Check to see if this is a GWS instruction. If so, and if this is CI or
1672     // VI, then the generated code sequence will include an S_WAITCNT 0.
1673     // TODO: Are these the only GWS instructions?
1674     if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1675         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1676         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1677         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1678         Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1679       // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1680       ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1681       ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1682       ScoreBrackets->updateByWait(LGKM_CNT,
1683                                   ScoreBrackets->getScoreUB(LGKM_CNT));
1684     }
1685 
1686     // TODO: Remove this work-around after fixing the scheduler and enable the
1687     // assert above.
1688     if (VCCZBugWorkAround) {
1689       // Restore the vccz bit.  Any time a value is written to vcc, the vcc
1690       // bit is updated, so we can restore the bit by reading the value of
1691       // vcc and then writing it back to the register.
1692       BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1693               AMDGPU::VCC)
1694           .addReg(AMDGPU::VCC);
1695       VCCZBugHandledSet.insert(&Inst);
1696     }
1697 
1698     ++Iter;
1699   }
1700 
1701   // Check if we need to force convergence at loop footer.
1702   MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
1703   if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
1704     LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1705     WaitcntData->print();
1706     DEBUG(dbgs() << '\n';);
1707 
1708     // The iterative waitcnt insertion algorithm aims for optimal waitcnt
1709     // placement and doesn't always guarantee convergence for a loop. Each
1710     // loop should take at most 2 iterations for it to converge naturally.
1711     // When this max is reached and result doesn't converge, we force
1712     // convergence by inserting a s_waitcnt at the end of loop footer.
1713     if (WaitcntData->getIterCnt() > 2) {
1714       // To ensure convergence, need to make wait events at loop footer be no
1715       // more than those from the previous iteration.
1716       // As a simplification, instead of tracking individual scores and
1717       // generating the precise wait count, just wait on 0.
1718       bool HasPending = false;
1719       MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
1720       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1721            T = (enum InstCounterType)(T + 1)) {
1722         if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1723           ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1724           HasPending = true;
1725         }
1726       }
1727 
1728       if (HasPending) {
1729         if (!SWaitInst) {
1730           SWaitInst = Block.getParent()->CreateMachineInstr(
1731               TII->get(AMDGPU::S_WAITCNT), DebugLoc());
1732           TrackedWaitcntSet.insert(SWaitInst);
1733           const MachineOperand &Op = MachineOperand::CreateImm(0);
1734           SWaitInst->addOperand(MF, Op);
1735 #if 0 // TODO: Format the debug output
1736           OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1737           OutputTransformAdd(SWaitInst, context);
1738 #endif
1739         }
1740 #if 0 // TODO: ??
1741         _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1742 #endif
1743       }
1744 
1745       if (SWaitInst) {
1746         DEBUG({
1747           SWaitInst->print(dbgs());
1748           dbgs() << "\nAdjusted score board:";
1749           ScoreBrackets->dump();
1750         });
1751 
1752         // Add this waitcnt to the block. It is either newly created or
1753         // created in previous iterations and added back since block traversal
1754         // always removes waitcnts.
1755         insertWaitcntBeforeCF(Block, SWaitInst);
1756         WaitcntData->setWaitcnt(SWaitInst);
1757       }
1758     }
1759   }
1760 }
1761 
1762 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1763   ST = &MF.getSubtarget<SISubtarget>();
1764   TII = ST->getInstrInfo();
1765   TRI = &TII->getRegisterInfo();
1766   MRI = &MF.getRegInfo();
1767   MLI = &getAnalysis<MachineLoopInfo>();
1768   IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
1769   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1770   AMDGPUASI = ST->getAMDGPUAS();
1771 
1772   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1773   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1774   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1775 
1776   HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1777   HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1778   assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1779   assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1780 
1781   RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1782   RegisterEncoding.VGPRL =
1783       RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1784   RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1785   RegisterEncoding.SGPRL =
1786       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1787 
1788   TrackedWaitcntSet.clear();
1789   BlockVisitedSet.clear();
1790   VCCZBugHandledSet.clear();
1791   LoopWaitcntDataMap.clear();
1792 
1793   // Walk over the blocks in reverse post-dominator order, inserting
1794   // s_waitcnt where needed.
1795   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1796   bool Modified = false;
1797   for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1798            I = RPOT.begin(),
1799            E = RPOT.end(), J = RPOT.begin();
1800        I != E;) {
1801     MachineBasicBlock &MBB = **I;
1802 
1803     BlockVisitedSet.insert(&MBB);
1804 
1805     BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1806     if (!ScoreBrackets) {
1807       BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
1808       ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1809     }
1810     ScoreBrackets->setPostOrder(MBB.getNumber());
1811     MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1812     if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
1813       LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
1814 
1815     // If we are walking into the block from before the loop, then guarantee
1816     // at least 1 re-walk over the loop to propagate the information, even if
1817     // no S_WAITCNT instructions were generated.
1818     if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1819       unsigned Count = countNumBottomBlocks(ContainingLoop);
1820 
1821       // If the loop has multiple back-edges, and so more than one "bottom"
1822       // basic block, we have to guarantee a re-walk over every blocks.
1823       if ((std::count(BlockWaitcntProcessedSet.begin(),
1824                       BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
1825         BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
1826         DEBUG(dbgs() << "set-revisit: Block"
1827                      << ContainingLoop->getHeader()->getNumber() << '\n';);
1828       }
1829     }
1830 
1831     // Walk over the instructions.
1832     insertWaitcntInBlock(MF, MBB);
1833 
1834     // Flag that waitcnts have been processed at least once.
1835     BlockWaitcntProcessedSet.push_back(&MBB);
1836 
1837     // See if we want to revisit the loop. If a loop has multiple back-edges,
1838     // we shouldn't revisit the same "bottom" basic block.
1839     if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1840         std::count(BlockWaitcntProcessedSet.begin(),
1841                    BlockWaitcntProcessedSet.end(), &MBB) == 1) {
1842       MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
1843       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1844       if (EntrySB && EntrySB->getRevisitLoop()) {
1845         EntrySB->setRevisitLoop(false);
1846         J = I;
1847         int32_t PostOrder = EntrySB->getPostOrder();
1848         // TODO: Avoid this loop. Find another way to set I.
1849         for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1850                  X = RPOT.begin(),
1851                  Y = RPOT.end();
1852              X != Y; ++X) {
1853           MachineBasicBlock &MBBX = **X;
1854           if (MBBX.getNumber() == PostOrder) {
1855             I = X;
1856             break;
1857           }
1858         }
1859         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1860         WaitcntData->incIterCnt();
1861         DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
1862         continue;
1863       } else {
1864         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1865         // Loop converged, reset iteration count. If this loop gets revisited,
1866         // it must be from an outer loop, the counter will restart, this will
1867         // ensure we don't force convergence on such revisits.
1868         WaitcntData->resetIterCnt();
1869       }
1870     }
1871 
1872     J = I;
1873     ++I;
1874   }
1875 
1876   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1877 
1878   bool HaveScalarStores = false;
1879 
1880   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1881        ++BI) {
1882     MachineBasicBlock &MBB = *BI;
1883 
1884     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1885          ++I) {
1886       if (!HaveScalarStores && TII->isScalarStore(*I))
1887         HaveScalarStores = true;
1888 
1889       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1890           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1891         EndPgmBlocks.push_back(&MBB);
1892     }
1893   }
1894 
1895   if (HaveScalarStores) {
1896     // If scalar writes are used, the cache must be flushed or else the next
1897     // wave to reuse the same scratch memory can be clobbered.
1898     //
1899     // Insert s_dcache_wb at wave termination points if there were any scalar
1900     // stores, and only if the cache hasn't already been flushed. This could be
1901     // improved by looking across blocks for flushes in postdominating blocks
1902     // from the stores but an explicitly requested flush is probably very rare.
1903     for (MachineBasicBlock *MBB : EndPgmBlocks) {
1904       bool SeenDCacheWB = false;
1905 
1906       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1907            ++I) {
1908         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1909           SeenDCacheWB = true;
1910         else if (TII->isScalarStore(*I))
1911           SeenDCacheWB = false;
1912 
1913         // FIXME: It would be better to insert this before a waitcnt if any.
1914         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1915              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1916             !SeenDCacheWB) {
1917           Modified = true;
1918           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1919         }
1920       }
1921     }
1922   }
1923 
1924   if (!MFI->isEntryFunction()) {
1925     // Wait for any outstanding memory operations that the input registers may
1926     // depend on. We can't track them and it's better to the wait after the
1927     // costly call sequence.
1928 
1929     // TODO: Could insert earlier and schedule more liberally with operations
1930     // that only use caller preserved registers.
1931     MachineBasicBlock &EntryBB = MF.front();
1932     BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1933       .addImm(0);
1934 
1935     Modified = true;
1936   }
1937 
1938   return Modified;
1939 }
1940