1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
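/// For example (illustrative only), a value produced by a vector-memory load
/// cannot be read until the load has completed:
///
///   buffer_load_dword v0, ...
///   buffer_load_dword v1, ...
///   s_waitcnt vmcnt(1)    ; loads of the same type complete in order, so
///                         ; once at most one is outstanding, v0 is ready
///   v_add_f32 v2, v0, v0
///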
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
23 //
24 //===----------------------------------------------------------------------===//
25 
26 #include "AMDGPU.h"
27 #include "GCNSubtarget.h"
28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "llvm/ADT/MapVector.h"
31 #include "llvm/ADT/PostOrderIterator.h"
32 #include "llvm/CodeGen/MachinePostDominators.h"
33 #include "llvm/InitializePasses.h"
34 #include "llvm/Support/DebugCounter.h"
35 #include "llvm/Support/TargetParser.h"
36 using namespace llvm;
37 
38 #define DEBUG_TYPE "si-insert-waitcnts"
39 
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");
46 
47 static cl::opt<bool> ForceEmitZeroFlag(
48   "amdgpu-waitcnt-forcezero",
49   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
50   cl::init(false), cl::Hidden);
51 
52 namespace {
53 
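// A minimal forward iterator over consecutive values of an enum, used by
// inst_counter_types() below to iterate over all InstCounterTypes.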
54 template <typename EnumT>
55 class enum_iterator
56     : public iterator_facade_base<enum_iterator<EnumT>,
57                                   std::forward_iterator_tag, const EnumT> {
58   EnumT Value;
59 public:
60   enum_iterator() = default;
61   enum_iterator(EnumT Value) : Value(Value) {}
62 
63   enum_iterator &operator++() {
64     Value = static_cast<EnumT>(Value + 1);
65     return *this;
66   }
67 
68   bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
69 
70   EnumT operator*() const { return Value; }
71 };
72 
// Class of object that encapsulates the latest instruction counter score
// associated with an operand. Used to determine whether an s_waitcnt
// instruction needs to be emitted.
76 
77 #define CNT_MASK(t) (1u << (t))
78 
79 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
80 
81 iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
82   return make_range(enum_iterator<InstCounterType>(VM_CNT),
83                     enum_iterator<InstCounterType>(NUM_INST_CNTS));
84 }
85 
86 using RegInterval = std::pair<int, int>;
87 
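// Per-counter hardware maxima, i.e. the largest value each s_waitcnt field
// can hold; queried through WaitcntBrackets::getWaitCountMax().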
88 struct {
89   unsigned VmcntMax;
90   unsigned ExpcntMax;
91   unsigned LgkmcntMax;
92   unsigned VscntMax;
93 } HardwareLimits;
94 
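// Hardware encodings of the first and last VGPR and SGPR (the *L fields are
// the last encodings); used by getRegInterval() to map an operand's encoding
// to a slot in the scoring tables.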
95 struct {
96   unsigned VGPR0;
97   unsigned VGPRL;
98   unsigned SGPR0;
99   unsigned SGPRL;
100 } RegisterEncoding;
101 
102 enum WaitEventType {
103   VMEM_ACCESS,      // vector-memory read & write
104   VMEM_READ_ACCESS, // vector-memory read
  VMEM_WRITE_ACCESS, // vector-memory write
106   LDS_ACCESS,       // lds read & write
107   GDS_ACCESS,       // gds read & write
108   SQ_MESSAGE,       // send message
109   SMEM_ACCESS,      // scalar-memory read & write
110   EXP_GPR_LOCK,     // export holding on its data src
111   GDS_GPR_LOCK,     // GDS holding on its data and addr src
112   EXP_POS_ACCESS,   // write to export position
113   EXP_PARAM_ACCESS, // write to export parameter
114   VMW_GPR_LOCK,     // vector-memory write holding on its data src
115   NUM_WAIT_EVENTS,
116 };
117 
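// The set of wait events tracked by each counter, indexed by InstCounterType
// (VM_CNT, LGKM_CNT, EXP_CNT, VS_CNT in that order).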
118 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
119   (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
120   (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
121       (1 << SQ_MESSAGE),
122   (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
123       (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
124   (1 << VMEM_WRITE_ACCESS)
125 };
126 
127 // The mapping is:
128 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
129 //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
130 //  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
131 // We reserve a fixed number of VGPR slots in the scoring tables for
132 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
133 enum RegisterMapping {
134   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
135   AGPR_OFFSET = 226, // Maximum programmable ArchVGPRs across all targets.
136   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
137   NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
138   EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
139   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
140 };
141 
142 // Enumerate different types of result-returning VMEM operations. Although
143 // s_waitcnt orders them all with a single vmcnt counter, in the absence of
144 // s_waitcnt only instructions of the same VmemType are guaranteed to write
145 // their results in order -- so there is no need to insert an s_waitcnt between
146 // two instructions of the same type that write the same vgpr.
147 enum VmemType {
148   // BUF instructions and MIMG instructions without a sampler.
149   VMEM_NOSAMPLER,
150   // MIMG instructions with a sampler.
151   VMEM_SAMPLER,
152 };
153 
154 VmemType getVmemType(const MachineInstr &Inst) {
155   assert(SIInstrInfo::isVMEM(Inst));
156   if (!SIInstrInfo::isMIMG(Inst))
157     return VMEM_NOSAMPLER;
158   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
159   return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler
160              ? VMEM_SAMPLER
161              : VMEM_NOSAMPLER;
162 }
163 
164 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
165   switch (T) {
166   case VM_CNT:
167     Wait.VmCnt = std::min(Wait.VmCnt, Count);
168     break;
169   case EXP_CNT:
170     Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
171     break;
172   case LGKM_CNT:
173     Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
174     break;
175   case VS_CNT:
176     Wait.VsCnt = std::min(Wait.VsCnt, Count);
177     break;
178   default:
179     llvm_unreachable("bad InstCounterType");
180   }
181 }
182 
// This class maintains the current score bracket of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt, in order to know whether there are multiple types of events within
// the bracket. When multiple event types happen within the bracket, the wait
// count may get decremented out of order, so we need to put in an
// "s_waitcnt 0" before the use.
191 class WaitcntBrackets {
192 public:
193   WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {}
194 
195   static unsigned getWaitCountMax(InstCounterType T) {
196     switch (T) {
197     case VM_CNT:
198       return HardwareLimits.VmcntMax;
199     case LGKM_CNT:
200       return HardwareLimits.LgkmcntMax;
201     case EXP_CNT:
202       return HardwareLimits.ExpcntMax;
203     case VS_CNT:
204       return HardwareLimits.VscntMax;
205     default:
206       break;
207     }
208     return 0;
209   }
210 
211   unsigned getScoreLB(InstCounterType T) const {
212     assert(T < NUM_INST_CNTS);
213     return ScoreLBs[T];
214   }
215 
216   unsigned getScoreUB(InstCounterType T) const {
217     assert(T < NUM_INST_CNTS);
218     return ScoreUBs[T];
219   }
220 
221   // Mapping from event to counter.
222   InstCounterType eventCounter(WaitEventType E) {
223     if (WaitEventMaskForInst[VM_CNT] & (1 << E))
224       return VM_CNT;
225     if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
226       return LGKM_CNT;
227     if (WaitEventMaskForInst[VS_CNT] & (1 << E))
228       return VS_CNT;
229     assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
230     return EXP_CNT;
231   }
232 
233   unsigned getRegScore(int GprNo, InstCounterType T) {
234     if (GprNo < NUM_ALL_VGPRS) {
235       return VgprScores[T][GprNo];
236     }
237     assert(T == LGKM_CNT);
238     return SgprScores[GprNo - NUM_ALL_VGPRS];
239   }
240 
241   bool merge(const WaitcntBrackets &Other);
242 
243   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
244                              const MachineRegisterInfo *MRI,
245                              const SIRegisterInfo *TRI, unsigned OpNo) const;
246 
247   bool counterOutOfOrder(InstCounterType T) const;
248   bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
249   bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
250   void determineWait(InstCounterType T, unsigned ScoreToWait,
251                      AMDGPU::Waitcnt &Wait) const;
252   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
253   void applyWaitcnt(InstCounterType T, unsigned Count);
254   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
255                      const MachineRegisterInfo *MRI, WaitEventType E,
256                      MachineInstr &MI);
257 
258   bool hasPending() const { return PendingEvents != 0; }
259   bool hasPendingEvent(WaitEventType E) const {
260     return PendingEvents & (1 << E);
261   }
262 
263   bool hasMixedPendingEvents(InstCounterType T) const {
264     unsigned Events = PendingEvents & WaitEventMaskForInst[T];
265     // Return true if more than one bit is set in Events.
266     return Events & (Events - 1);
267   }
268 
269   bool hasPendingFlat() const {
270     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
271              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
272             (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
273              LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
274   }
275 
276   void setPendingFlat() {
277     LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
278     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
279   }
280 
281   // Return true if there might be pending writes to the specified vgpr by VMEM
282   // instructions with types different from V.
283   bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
284     assert(GprNo < NUM_ALL_VGPRS);
285     return VgprVmemTypes[GprNo] & ~(1 << V);
286   }
287 
288   void clearVgprVmemTypes(int GprNo) {
289     assert(GprNo < NUM_ALL_VGPRS);
290     VgprVmemTypes[GprNo] = 0;
291   }
292 
293   void print(raw_ostream &);
294   void dump() { print(dbgs()); }
295 
296 private:
297   struct MergeInfo {
298     unsigned OldLB;
299     unsigned OtherLB;
300     unsigned MyShift;
301     unsigned OtherShift;
302   };
303   static bool mergeScore(const MergeInfo &M, unsigned &Score,
304                          unsigned OtherScore);
305 
306   void setScoreLB(InstCounterType T, unsigned Val) {
307     assert(T < NUM_INST_CNTS);
308     ScoreLBs[T] = Val;
309   }
310 
311   void setScoreUB(InstCounterType T, unsigned Val) {
312     assert(T < NUM_INST_CNTS);
313     ScoreUBs[T] = Val;
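    // Clamp the EXP_CNT lower bound so the tracked range never exceeds the
    // maximum value the hardware counter can hold.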
314     if (T == EXP_CNT) {
315       unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
316       if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
317         ScoreLBs[T] = UB;
318     }
319   }
320 
321   void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
322     if (GprNo < NUM_ALL_VGPRS) {
323       VgprUB = std::max(VgprUB, GprNo);
324       VgprScores[T][GprNo] = Val;
325     } else {
326       assert(T == LGKM_CNT);
327       SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
328       SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
329     }
330   }
331 
332   void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
333                    const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
334                    unsigned OpNo, unsigned Val);
335 
336   const GCNSubtarget *ST = nullptr;
337   unsigned ScoreLBs[NUM_INST_CNTS] = {0};
338   unsigned ScoreUBs[NUM_INST_CNTS] = {0};
339   unsigned PendingEvents = 0;
340   // Remember the last flat memory operation.
341   unsigned LastFlat[NUM_INST_CNTS] = {0};
  // Wait cnt scores for every vgpr.
  // Keep track of VgprUB and SgprUB to make merges at block joins efficient.
344   int VgprUB = -1;
345   int SgprUB = -1;
346   unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
347   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
348   unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
349   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
350   // write to each vgpr.
351   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
352 };
353 
354 class SIInsertWaitcnts : public MachineFunctionPass {
355 private:
356   const GCNSubtarget *ST = nullptr;
357   const SIInstrInfo *TII = nullptr;
358   const SIRegisterInfo *TRI = nullptr;
359   const MachineRegisterInfo *MRI = nullptr;
360   AMDGPU::IsaVersion IV;
361 
362   DenseSet<MachineInstr *> TrackedWaitcntSet;
363   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
364   MachinePostDominatorTree *PDT;
365 
366   struct BlockInfo {
367     MachineBasicBlock *MBB;
368     std::unique_ptr<WaitcntBrackets> Incoming;
369     bool Dirty = true;
370 
371     explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
372   };
373 
374   MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
375 
  // ForceEmitZeroWaitcnts: force all waitcnt instructions to be s_waitcnt 0
  // because of the amdgpu-waitcnt-forcezero flag.
378   bool ForceEmitZeroWaitcnts;
379   bool ForceEmitWaitcnt[NUM_INST_CNTS];
380 
381 public:
382   static char ID;
383 
384   SIInsertWaitcnts() : MachineFunctionPass(ID) {
385     (void)ForceExpCounter;
386     (void)ForceLgkmCounter;
387     (void)ForceVMCounter;
388   }
389 
390   bool runOnMachineFunction(MachineFunction &MF) override;
391 
392   StringRef getPassName() const override {
393     return "SI insert wait instructions";
394   }
395 
396   void getAnalysisUsage(AnalysisUsage &AU) const override {
397     AU.setPreservesCFG();
398     AU.addRequired<MachinePostDominatorTree>();
399     MachineFunctionPass::getAnalysisUsage(AU);
400   }
401 
402   bool isForceEmitWaitcnt() const {
403     for (auto T : inst_counter_types())
404       if (ForceEmitWaitcnt[T])
405         return true;
406     return false;
407   }
408 
409   void setForceEmitWaitcnt() {
410 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
411 // For debug builds, get the debug counter info and adjust if need be
412 #ifndef NDEBUG
413     if (DebugCounter::isCounterSet(ForceExpCounter) &&
414         DebugCounter::shouldExecute(ForceExpCounter)) {
415       ForceEmitWaitcnt[EXP_CNT] = true;
416     } else {
417       ForceEmitWaitcnt[EXP_CNT] = false;
418     }
419 
420     if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
421          DebugCounter::shouldExecute(ForceLgkmCounter)) {
422       ForceEmitWaitcnt[LGKM_CNT] = true;
423     } else {
424       ForceEmitWaitcnt[LGKM_CNT] = false;
425     }
426 
427     if (DebugCounter::isCounterSet(ForceVMCounter) &&
428         DebugCounter::shouldExecute(ForceVMCounter)) {
429       ForceEmitWaitcnt[VM_CNT] = true;
430     } else {
431       ForceEmitWaitcnt[VM_CNT] = false;
432     }
433 #endif // NDEBUG
434   }
435 
436   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
437   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
438   bool generateWaitcntInstBefore(MachineInstr &MI,
439                                  WaitcntBrackets &ScoreBrackets,
440                                  MachineInstr *OldWaitcntInstr);
441   void updateEventWaitcntAfter(MachineInstr &Inst,
442                                WaitcntBrackets *ScoreBrackets);
443   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
444                             WaitcntBrackets &ScoreBrackets);
445 };
446 
447 } // end anonymous namespace
448 
449 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
450                                             const SIInstrInfo *TII,
451                                             const MachineRegisterInfo *MRI,
452                                             const SIRegisterInfo *TRI,
453                                             unsigned OpNo) const {
454   const MachineOperand &Op = MI->getOperand(OpNo);
455   if (!TRI->isInAllocatableClass(Op.getReg()))
456     return {-1, -1};
457 
  // A use via a partial-write (PW) operand does not need a waitcnt.
  // A partial write is not a WAW hazard.
460   assert(!Op.getSubReg() || !Op.isUndef());
461 
462   RegInterval Result;
463 
464   unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
465 
466   if (TRI->isVectorRegister(*MRI, Op.getReg())) {
467     assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
468     Result.first = Reg - RegisterEncoding.VGPR0;
469     if (TRI->isAGPR(*MRI, Op.getReg()))
470       Result.first += AGPR_OFFSET;
471     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
472   } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
473     assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
474     Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
475     assert(Result.first >= NUM_ALL_VGPRS &&
476            Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
477   }
478   // TODO: Handle TTMP
479   // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
480   else
481     return {-1, -1};
482 
483   const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
484   unsigned Size = TRI->getRegSizeInBits(*RC);
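  // One scoreboard slot per 32 bits of the register; a trailing 16-bit half
  // (e.g. for 16-bit register classes) still occupies a full slot.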
485   Result.second = Result.first + ((Size + 16) / 32);
486 
487   return Result;
488 }
489 
490 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
491                                   const SIInstrInfo *TII,
492                                   const SIRegisterInfo *TRI,
493                                   const MachineRegisterInfo *MRI, unsigned OpNo,
494                                   unsigned Val) {
495   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
496   assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
497   for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
498     setRegScore(RegNo, EXP_CNT, Val);
499   }
500 }
501 
502 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
503                                     const SIRegisterInfo *TRI,
504                                     const MachineRegisterInfo *MRI,
505                                     WaitEventType E, MachineInstr &Inst) {
506   InstCounterType T = eventCounter(E);
507   unsigned CurrScore = getScoreUB(T) + 1;
508   if (CurrScore == 0)
509     report_fatal_error("InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not, for example vm_cnt for a
  // buffer store or lgkm_cnt for a send-message.
513   PendingEvents |= 1 << E;
514   setScoreUB(T, CurrScore);
515 
516   if (T == EXP_CNT) {
517     // Put score on the source vgprs. If this is a store, just use those
518     // specific register(s).
519     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
520       int AddrOpIdx =
521           AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
522       // All GDS operations must protect their address register (same as
523       // export.)
524       if (AddrOpIdx != -1) {
525         setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
526       }
527 
528       if (Inst.mayStore()) {
529         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
530                                        AMDGPU::OpName::data0) != -1) {
531           setExpScore(
532               &Inst, TII, TRI, MRI,
533               AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
534               CurrScore);
535         }
536         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
537                                        AMDGPU::OpName::data1) != -1) {
538           setExpScore(&Inst, TII, TRI, MRI,
539                       AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
540                                                  AMDGPU::OpName::data1),
541                       CurrScore);
542         }
543       } else if (SIInstrInfo::isAtomicRet(Inst) &&
544                  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
545                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
546                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
547                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
548                  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
549                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
550                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
551                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
552         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
553           const MachineOperand &Op = Inst.getOperand(I);
554           if (Op.isReg() && !Op.isDef() &&
555               TRI->isVectorRegister(*MRI, Op.getReg())) {
556             setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
557           }
558         }
559       }
560     } else if (TII->isFLAT(Inst)) {
561       if (Inst.mayStore()) {
562         setExpScore(
563             &Inst, TII, TRI, MRI,
564             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
565             CurrScore);
566       } else if (SIInstrInfo::isAtomicRet(Inst)) {
567         setExpScore(
568             &Inst, TII, TRI, MRI,
569             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
570             CurrScore);
571       }
572     } else if (TII->isMIMG(Inst)) {
573       if (Inst.mayStore()) {
574         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
575       } else if (SIInstrInfo::isAtomicRet(Inst)) {
576         setExpScore(
577             &Inst, TII, TRI, MRI,
578             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
579             CurrScore);
580       }
581     } else if (TII->isMTBUF(Inst)) {
582       if (Inst.mayStore()) {
583         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
584       }
585     } else if (TII->isMUBUF(Inst)) {
586       if (Inst.mayStore()) {
587         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
588       } else if (SIInstrInfo::isAtomicRet(Inst)) {
589         setExpScore(
590             &Inst, TII, TRI, MRI,
591             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
592             CurrScore);
593       }
594     } else {
595       if (TII->isEXP(Inst)) {
596         // For export the destination registers are really temps that
597         // can be used as the actual source after export patching, so
598         // we need to treat them like sources and set the EXP_CNT
599         // score.
600         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
601           MachineOperand &DefMO = Inst.getOperand(I);
602           if (DefMO.isReg() && DefMO.isDef() &&
603               TRI->isVGPR(*MRI, DefMO.getReg())) {
604             setRegScore(
605                 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
606                 EXP_CNT, CurrScore);
607           }
608         }
609       }
610       for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
611         MachineOperand &MO = Inst.getOperand(I);
612         if (MO.isReg() && !MO.isDef() &&
613             TRI->isVectorRegister(*MRI, MO.getReg())) {
614           setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
615         }
616       }
617     }
618 #if 0 // TODO: check if this is handled by MUBUF code above.
619   } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
620        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
621        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
622     MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
623     unsigned OpNo;//TODO: find the OpNo for this operand;
624     RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
625     for (int RegNo = Interval.first; RegNo < Interval.second;
626     ++RegNo) {
627       setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
628     }
629 #endif
630   } else {
631     // Match the score to the destination registers.
632     for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
633       auto &Op = Inst.getOperand(I);
634       if (!Op.isReg() || !Op.isDef())
635         continue;
636       RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
637       if (T == VM_CNT) {
638         if (Interval.first >= NUM_ALL_VGPRS)
639           continue;
640         if (SIInstrInfo::isVMEM(Inst)) {
641           VmemType V = getVmemType(Inst);
642           for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
643             VgprVmemTypes[RegNo] |= 1 << V;
644         }
645       }
646       for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
647         setRegScore(RegNo, T, CurrScore);
648       }
649     }
650     if (TII->isDS(Inst) && Inst.mayStore()) {
651       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
652     }
653   }
654 }
655 
656 void WaitcntBrackets::print(raw_ostream &OS) {
657   OS << '\n';
658   for (auto T : inst_counter_types()) {
659     unsigned LB = getScoreLB(T);
660     unsigned UB = getScoreUB(T);
661 
662     switch (T) {
663     case VM_CNT:
664       OS << "    VM_CNT(" << UB - LB << "): ";
665       break;
666     case LGKM_CNT:
667       OS << "    LGKM_CNT(" << UB - LB << "): ";
668       break;
669     case EXP_CNT:
670       OS << "    EXP_CNT(" << UB - LB << "): ";
671       break;
672     case VS_CNT:
673       OS << "    VS_CNT(" << UB - LB << "): ";
674       break;
675     default:
676       OS << "    UNKNOWN(" << UB - LB << "): ";
677       break;
678     }
679 
680     if (LB < UB) {
681       // Print vgpr scores.
682       for (int J = 0; J <= VgprUB; J++) {
683         unsigned RegScore = getRegScore(J, T);
684         if (RegScore <= LB)
685           continue;
686         unsigned RelScore = RegScore - LB - 1;
687         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
688           OS << RelScore << ":v" << J << " ";
689         } else {
690           OS << RelScore << ":ds ";
691         }
692       }
693       // Also need to print sgpr scores for lgkm_cnt.
694       if (T == LGKM_CNT) {
695         for (int J = 0; J <= SgprUB; J++) {
696           unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
697           if (RegScore <= LB)
698             continue;
699           unsigned RelScore = RegScore - LB - 1;
700           OS << RelScore << ":s" << J << " ";
701         }
702       }
703     }
704     OS << '\n';
705   }
706   OS << '\n';
707 }
708 
709 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
710 /// whether a waitcnt instruction is needed at all.
711 bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
712   return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
713          simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
714          simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
715          simplifyWaitcnt(VS_CNT, Wait.VsCnt);
716 }
717 
718 bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
719                                       unsigned &Count) const {
720   const unsigned LB = getScoreLB(T);
721   const unsigned UB = getScoreUB(T);
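  // The requested wait only matters if it would retire an operation that is
  // not already known to be complete, i.e. if UB - Count still lies above the
  // lower bound; otherwise it is redundant and dropped.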
722   if (Count < UB && UB - Count > LB)
723     return true;
724 
725   Count = ~0u;
726   return false;
727 }
728 
729 void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
730                                     AMDGPU::Waitcnt &Wait) const {
731   // If the score of src_operand falls within the bracket, we need an
732   // s_waitcnt instruction.
733   const unsigned LB = getScoreLB(T);
734   const unsigned UB = getScoreUB(T);
735   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
736     if ((T == VM_CNT || T == LGKM_CNT) &&
737         hasPendingFlat() &&
738         !ST->hasFlatLgkmVMemCountInOrder()) {
739       // If there is a pending FLAT operation, and this is a VMem or LGKM
740       // waitcnt and the target can report early completion, then we need
741       // to force a waitcnt 0.
742       addWait(Wait, T, 0);
743     } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are multiple
      // event types in the bracket, so emit an s_waitcnt with a conservative
      // value of 0 for this counter as well.
747       addWait(Wait, T, 0);
748     } else {
749       // If a counter has been maxed out avoid overflow by waiting for
750       // MAX(CounterType) - 1 instead.
751       unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
752       addWait(Wait, T, NeededWait);
753     }
754   }
755 }
756 
757 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
758   applyWaitcnt(VM_CNT, Wait.VmCnt);
759   applyWaitcnt(EXP_CNT, Wait.ExpCnt);
760   applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
761   applyWaitcnt(VS_CNT, Wait.VsCnt);
762 }
763 
764 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
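  // After an s_waitcnt <T>(Count), at most Count events of type T remain
  // outstanding, so the lower bound can be raised to UB - Count. If the
  // counter may complete out of order, only a wait of zero gives a guarantee.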
765   const unsigned UB = getScoreUB(T);
766   if (Count >= UB)
767     return;
768   if (Count != 0) {
769     if (counterOutOfOrder(T))
770       return;
771     setScoreLB(T, std::max(getScoreLB(T), UB - Count));
772   } else {
773     setScoreLB(T, UB);
774     PendingEvents &= ~WaitEventMaskForInst[T];
775   }
776 }
777 
778 // Where there are multiple types of event in the bracket of a counter,
779 // the decrement may go out of order.
780 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always complete out of order.
782   if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
783     return true;
784   return hasMixedPendingEvents(T);
785 }
786 
787 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
788                       false)
789 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
790 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
791                     false)
792 
793 char SIInsertWaitcnts::ID = 0;
794 
795 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
796 
797 FunctionPass *llvm::createSIInsertWaitcntsPass() {
798   return new SIInsertWaitcnts();
799 }
800 
801 static bool readsVCCZ(const MachineInstr &MI) {
802   unsigned Opc = MI.getOpcode();
803   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
804          !MI.getOperand(1).isUndef();
805 }
806 
807 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
808 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
809   // Currently all conventions wait, but this may not always be the case.
810   //
  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
  // sense to omit the wait and do it in the caller.
813   return true;
814 }
815 
/// \returns true if the callee is expected to wait for any outstanding
/// counters before returning.
818 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
819   return true;
820 }
821 
///  Generate an s_waitcnt instruction to be placed before \p MI, if needed.
///  Instructions of a given type are returned in order, but instructions of
///  different types can complete out of order. We rely on this in-order
///  completion and simply assign a score to each memory access instruction.
///  We keep track of the active "score bracket" to determine whether a memory
///  access requires an s_waitcnt, and if so what the value of each counter
///  should be. The "score bracket" is bound by the lower bound and upper
///  bound scores (*_score_LB and *_score_ub respectively).
832 bool SIInsertWaitcnts::generateWaitcntInstBefore(
833     MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
834     MachineInstr *OldWaitcntInstr) {
835   setForceEmitWaitcnt();
836   bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
837 
838   if (MI.isMetaInstruction())
839     return false;
840 
841   AMDGPU::Waitcnt Wait;
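  // Wait starts out requiring no wait at all (every counter at ~0u) and is
  // tightened below as hazards are found.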
842 
843   // See if this instruction has a forced S_WAITCNT VM.
844   // TODO: Handle other cases of NeedsWaitcntVmBefore()
845   if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
846       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
847       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
848       MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
849       MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
850     Wait.VmCnt = 0;
851   }
852 
853   // All waits must be resolved at call return.
854   // NOTE: this could be improved with knowledge of all call sites or
855   //   with knowledge of the called routines.
856   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
857       MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
858       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
859     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
860   }
861   // Resolve vm waits before gs-done.
862   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
863             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
864            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
865             AMDGPU::SendMsg::ID_GS_DONE)) {
866     Wait.VmCnt = 0;
867   }
868 #if 0 // TODO: the following blocks of logic when we have fence.
869   else if (MI.getOpcode() == SC_FENCE) {
870     const unsigned int group_size =
871       context->shader_info->GetMaxThreadGroupSize();
872     // group_size == 0 means thread group size is unknown at compile time
873     const bool group_is_multi_wave =
874       (group_size == 0 || group_size > target_info->GetWaveFrontSize());
875     const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
876 
877     for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
878       SCRegType src_type = Inst->GetSrcType(i);
879       switch (src_type) {
880         case SCMEM_LDS:
881           if (group_is_multi_wave ||
882             context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
883             EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
884                                ScoreBrackets->getScoreUB(LGKM_CNT));
885             // LDS may have to wait for VM_CNT after buffer load to LDS
886             if (target_info->HasBufferLoadToLDS()) {
887               EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
888                                  ScoreBrackets->getScoreUB(VM_CNT));
889             }
890           }
891           break;
892 
893         case SCMEM_GDS:
894           if (group_is_multi_wave || fence_is_global) {
895             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
896               ScoreBrackets->getScoreUB(EXP_CNT));
897             EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
898               ScoreBrackets->getScoreUB(LGKM_CNT));
899           }
900           break;
901 
902         case SCMEM_UAV:
903         case SCMEM_TFBUF:
904         case SCMEM_RING:
905         case SCMEM_SCATTER:
906           if (group_is_multi_wave || fence_is_global) {
907             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
908               ScoreBrackets->getScoreUB(EXP_CNT));
909             EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
910               ScoreBrackets->getScoreUB(VM_CNT));
911           }
912           break;
913 
914         case SCMEM_SCRATCH:
915         default:
916           break;
917       }
918     }
919   }
920 #endif
921 
922   // Export & GDS instructions do not read the EXEC mask until after the export
923   // is granted (which can occur well after the instruction is issued).
924   // The shader program must flush all EXP operations on the export-count
925   // before overwriting the EXEC mask.
926   else {
927     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
928       // Export and GDS are tracked individually, either may trigger a waitcnt
929       // for EXEC.
930       if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
931           ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
932           ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
933           ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
934         Wait.ExpCnt = 0;
935       }
936     }
937 
938     if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
939       // The function is going to insert a wait on everything in its prolog.
      // This still needs to be careful if the call target is a load (e.g. a GOT
      // load). We also need to check the WAW dependency with the saved PC.
942       Wait = AMDGPU::Waitcnt();
943 
944       int CallAddrOpIdx =
945           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
946 
947       if (MI.getOperand(CallAddrOpIdx).isReg()) {
948         RegInterval CallAddrOpInterval =
949           ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
950 
951         for (int RegNo = CallAddrOpInterval.first;
952              RegNo < CallAddrOpInterval.second; ++RegNo)
953           ScoreBrackets.determineWait(
954             LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
955 
956         int RtnAddrOpIdx =
957           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
958         if (RtnAddrOpIdx != -1) {
959           RegInterval RtnAddrOpInterval =
960             ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
961 
962           for (int RegNo = RtnAddrOpInterval.first;
963                RegNo < RtnAddrOpInterval.second; ++RegNo)
964             ScoreBrackets.determineWait(
965               LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
966         }
967       }
968     } else {
969       // FIXME: Should not be relying on memoperands.
970       // Look at the source operands of every instruction to see if
971       // any of them results from a previous memory operation that affects
972       // its current usage. If so, an s_waitcnt instruction needs to be
973       // emitted.
974       // If the source operand was defined by a load, add the s_waitcnt
975       // instruction.
976       //
977       // Two cases are handled for destination operands:
978       // 1) If the destination operand was defined by a load, add the s_waitcnt
979       // instruction to guarantee the right WAW order.
      // 2) If a destination operand was used by a recent export/store
      // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
982       for (const MachineMemOperand *Memop : MI.memoperands()) {
983         const Value *Ptr = Memop->getValue();
984         if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
985           addWait(Wait, LGKM_CNT, 0);
986           if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
987             SLoadAddresses.erase(Ptr);
988         }
989         unsigned AS = Memop->getAddrSpace();
990         if (AS != AMDGPUAS::LOCAL_ADDRESS)
991           continue;
992         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
993         // VM_CNT is only relevant to vgpr or LDS.
994         ScoreBrackets.determineWait(
995             VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
996         if (Memop->isStore()) {
997           ScoreBrackets.determineWait(
998               EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
999         }
1000       }
1001 
1002       // Loop over use and def operands.
1003       for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1004         MachineOperand &Op = MI.getOperand(I);
1005         if (!Op.isReg())
1006           continue;
1007         RegInterval Interval =
1008             ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
1009 
1010         const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1011         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1012           if (IsVGPR) {
1013             // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1014             // previous write and this write are the same type of VMEM
1015             // instruction, in which case they're guaranteed to write their
1016             // results in order anyway.
1017             if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
1018                 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1019                                                        getVmemType(MI))) {
1020               ScoreBrackets.determineWait(
1021                   VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1022               ScoreBrackets.clearVgprVmemTypes(RegNo);
1023             }
1024             if (Op.isDef()) {
1025               ScoreBrackets.determineWait(
1026                   EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1027             }
1028           }
1029           ScoreBrackets.determineWait(
1030               LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1031         }
1032       }
1033     }
1034   }
1035 
1036   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1037   // occurs before the instruction. Doing it here prevents any additional
1038   // S_WAITCNTs from being emitted if the instruction was marked as
1039   // requiring a WAITCNT beforehand.
1040   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1041       !ST->hasAutoWaitcntBeforeBarrier()) {
1042     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1043   }
1044 
1045   // TODO: Remove this work-around, enable the assert for Bug 457939
1046   //       after fixing the scheduler. Also, the Shader Compiler code is
1047   //       independent of target.
1048   if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1049     if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1050             ScoreBrackets.getScoreUB(LGKM_CNT) &&
1051         ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1052       Wait.LgkmCnt = 0;
1053     }
1054   }
1055 
1056   // Early-out if no wait is indicated.
1057   if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
1058     bool Modified = false;
1059     if (OldWaitcntInstr) {
1060       for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1061            &*II != &MI; II = NextI, ++NextI) {
1062         if (II->isDebugInstr())
1063           continue;
1064 
1065         if (TrackedWaitcntSet.count(&*II)) {
1066           TrackedWaitcntSet.erase(&*II);
1067           II->eraseFromParent();
1068           Modified = true;
1069         } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1070           int64_t Imm = II->getOperand(0).getImm();
1071           ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1072         } else {
1073           assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1074           assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1075           auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
1076           ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W));
1077         }
1078       }
1079     }
1080     return Modified;
1081   }
1082 
1083   if (ForceEmitZeroWaitcnts)
1084     Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
1085 
1086   if (ForceEmitWaitcnt[VM_CNT])
1087     Wait.VmCnt = 0;
1088   if (ForceEmitWaitcnt[EXP_CNT])
1089     Wait.ExpCnt = 0;
1090   if (ForceEmitWaitcnt[LGKM_CNT])
1091     Wait.LgkmCnt = 0;
1092   if (ForceEmitWaitcnt[VS_CNT])
1093     Wait.VsCnt = 0;
1094 
1095   ScoreBrackets.applyWaitcnt(Wait);
1096 
1097   AMDGPU::Waitcnt OldWait;
1098   bool Modified = false;
1099 
1100   if (OldWaitcntInstr) {
1101     for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1102          &*II != &MI; II = NextI, NextI++) {
1103       if (II->isDebugInstr())
1104         continue;
1105 
1106       if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1107         unsigned IEnc = II->getOperand(0).getImm();
1108         AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1109         OldWait = OldWait.combined(IWait);
1110         if (!TrackedWaitcntSet.count(&*II))
1111           Wait = Wait.combined(IWait);
1112         unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
1113         if (IEnc != NewEnc) {
1114           II->getOperand(0).setImm(NewEnc);
1115           Modified = true;
1116         }
1117         Wait.VmCnt = ~0u;
1118         Wait.LgkmCnt = ~0u;
1119         Wait.ExpCnt = ~0u;
1120       } else {
1121         assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1122         assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1123 
1124         unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)
1125                         ->getImm();
1126         OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
1127         if (!TrackedWaitcntSet.count(&*II))
1128           Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
1129         if (Wait.VsCnt != ICnt) {
1130           TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt);
1131           Modified = true;
1132         }
1133         Wait.VsCnt = ~0u;
1134       }
1135 
1136       LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1137                         << "Old Instr: " << MI
1138                         << "New Instr: " << *II << '\n');
1139 
1140       if (!Wait.hasWait())
1141         return Modified;
1142     }
1143   }
1144 
1145   if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
1146     unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1147     auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1148                              MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1149                          .addImm(Enc);
1150     TrackedWaitcntSet.insert(SWaitInst);
1151     Modified = true;
1152 
1153     LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1154                       << "Old Instr: " << MI
1155                       << "New Instr: " << *SWaitInst << '\n');
1156   }
1157 
1158   if (Wait.VsCnt != ~0u) {
1159     assert(ST->hasVscnt());
1160 
1161     auto SWaitInst =
1162         BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1163                 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1164             .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1165             .addImm(Wait.VsCnt);
1166     TrackedWaitcntSet.insert(SWaitInst);
1167     Modified = true;
1168 
1169     LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1170                       << "Old Instr: " << MI
1171                       << "New Instr: " << *SWaitInst << '\n');
1172   }
1173 
1174   return Modified;
1175 }
1176 
1177 // This is a flat memory operation. Check to see if it has memory tokens other
1178 // than LDS. Other address spaces supported by flat memory operations involve
1179 // global memory.
1180 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1181   assert(TII->isFLAT(MI));
1182 
1183   // All flat instructions use the VMEM counter.
1184   assert(TII->usesVM_CNT(MI));
1185 
1186   // If there are no memory operands then conservatively assume the flat
1187   // operation may access VMEM.
1188   if (MI.memoperands_empty())
1189     return true;
1190 
1191   // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1194   // (GDS) address space is not supported by flat operations. Therefore, simply
1195   // return true unless only the LDS address space is found.
1196   for (const MachineMemOperand *Memop : MI.memoperands()) {
1197     unsigned AS = Memop->getAddrSpace();
1198     assert(AS != AMDGPUAS::REGION_ADDRESS);
1199     if (AS != AMDGPUAS::LOCAL_ADDRESS)
1200       return true;
1201   }
1202 
1203   return false;
1204 }
1205 
1206 // This is a flat memory operation. Check to see if it has memory tokens for
1207 // either LDS or FLAT.
1208 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1209   assert(TII->isFLAT(MI));
1210 
  // Flat instructions such as SCRATCH and GLOBAL do not use the LGKM counter.
1212   if (!TII->usesLGKM_CNT(MI))
1213     return false;
1214 
1215   // If in tgsplit mode then there can be no use of LDS.
1216   if (ST->isTgSplitEnabled())
1217     return false;
1218 
1219   // If there are no memory operands then conservatively assume the flat
1220   // operation may access LDS.
1221   if (MI.memoperands_empty())
1222     return true;
1223 
1224   // See if any memory operand specifies an address space that involves LDS.
1225   for (const MachineMemOperand *Memop : MI.memoperands()) {
1226     unsigned AS = Memop->getAddrSpace();
1227     if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1228       return true;
1229   }
1230 
1231   return false;
1232 }
1233 
1234 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1235                                                WaitcntBrackets *ScoreBrackets) {
1236   // Now look at the instruction opcode. If it is a memory access
1237   // instruction, update the upper-bound of the appropriate counter's
1238   // bracket and the destination operand scores.
1239   // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1240   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1241     if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1242         TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1243       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1244       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1245     } else {
1246       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1247     }
1248   } else if (TII->isFLAT(Inst)) {
1249     assert(Inst.mayLoadOrStore());
1250 
1251     int FlatASCount = 0;
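    // Count how many address spaces (VMEM and/or LDS) this flat operation may
    // access; touching both makes it a pending "flat" access below.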
1252 
1253     if (mayAccessVMEMThroughFlat(Inst)) {
1254       ++FlatASCount;
1255       if (!ST->hasVscnt())
1256         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1257       else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
1258         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1259       else
1260         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1261     }
1262 
1263     if (mayAccessLDSThroughFlat(Inst)) {
1264       ++FlatASCount;
1265       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1266     }
1267 
1268     // A Flat memory operation must access at least one address space.
1269     assert(FlatASCount);
1270 
    // This is a flat memory operation that accesses both VMEM and LDS, so note
    // it - it will require that both the VM and LGKM counters be flushed to
    // zero if it is still pending when a VM or LGKM dependency occurs.
1274     if (FlatASCount > 1)
1275       ScoreBrackets->setPendingFlat();
1276   } else if (SIInstrInfo::isVMEM(Inst) &&
1277              // TODO: get a better carve out.
1278              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1279              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1280              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
1281              Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
1282              Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
1283     if (!ST->hasVscnt())
1284       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1285     else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
1286              /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
1287              (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
1288       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1289     else if (Inst.mayStore())
1290       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1291 
1292     if (ST->vmemWriteNeedsExpWaitcnt() &&
1293         (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
1294       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1295     }
1296   } else if (TII->isSMRD(Inst)) {
1297     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1298   } else if (Inst.isCall()) {
1299     if (callWaitsOnFunctionReturn(Inst)) {
1300       // Act as a wait on everything
1301       ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1302     } else {
      // May need to wait for anything.
1304       ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1305     }
1306   } else if (SIInstrInfo::isEXP(Inst)) {
1307     unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1308     if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
1309       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1310     else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
1311       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1312     else
1313       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1314   } else {
1315     switch (Inst.getOpcode()) {
1316     case AMDGPU::S_SENDMSG:
1317     case AMDGPU::S_SENDMSGHALT:
1318       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1319       break;
1320     case AMDGPU::S_MEMTIME:
1321     case AMDGPU::S_MEMREALTIME:
1322       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1323       break;
1324     }
1325   }
1326 }
1327 
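// Rebase a per-register score from each side onto the merged bracket: a score
// at or below that side's old lower bound is already complete and collapses
// to zero; otherwise it is shifted so both sides share the new upper bound.
// Returns true if the incoming score is strictly newer than ours.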
1328 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
1329                                  unsigned OtherScore) {
1330   unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1331   unsigned OtherShifted =
1332       OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1333   Score = std::max(MyShifted, OtherShifted);
1334   return OtherShifted > MyShifted;
1335 }
1336 
/// Merge the pending events and associated score brackets of \p Other into
/// this bracket's status.
1339 ///
1340 /// Returns whether the merge resulted in a change that requires tighter waits
1341 /// (i.e. the merged brackets strictly dominate the original brackets).
1342 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1343   bool StrictDom = false;
1344 
1345   VgprUB = std::max(VgprUB, Other.VgprUB);
1346   SgprUB = std::max(SgprUB, Other.SgprUB);
1347 
1348   for (auto T : inst_counter_types()) {
1349     // Merge event flags for this counter
1350     const bool OldOutOfOrder = counterOutOfOrder(T);
1351     const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
1352     const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1353     if (OtherEvents & ~OldEvents)
1354       StrictDom = true;
1355     PendingEvents |= OtherEvents;
1356 
1357     // Merge scores for this counter
1358     const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
1359     const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1360     const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
1361     if (NewUB < ScoreLBs[T])
1362       report_fatal_error("waitcnt score overflow");
1363 
1364     MergeInfo M;
1365     M.OldLB = ScoreLBs[T];
1366     M.OtherLB = Other.ScoreLBs[T];
1367     M.MyShift = NewUB - ScoreUBs[T];
1368     M.OtherShift = NewUB - Other.ScoreUBs[T];
1369 
1370     ScoreUBs[T] = NewUB;
1371 
1372     StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1373 
1374     bool RegStrictDom = false;
1375     for (int J = 0; J <= VgprUB; J++) {
1376       RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1377     }
1378 
1379     if (T == VM_CNT) {
1380       for (int J = 0; J <= VgprUB; J++) {
1381         unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
1382         RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
1383         VgprVmemTypes[J] = NewVmemTypes;
1384       }
1385     }
1386 
1387     if (T == LGKM_CNT) {
1388       for (int J = 0; J <= SgprUB; J++) {
1389         RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1390       }
1391     }
1392 
1393     if (RegStrictDom && !OldOutOfOrder)
1394       StrictDom = true;
1395   }
1396 
1397   return StrictDom;
1398 }
1399 
1400 // Generate s_waitcnt instructions where needed.
1401 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1402                                             MachineBasicBlock &Block,
1403                                             WaitcntBrackets &ScoreBrackets) {
1404   bool Modified = false;
1405 
1406   LLVM_DEBUG({
1407     dbgs() << "*** Block " << Block.getNumber() << " ***";
1408     ScoreBrackets.dump();
1409   });
1410 
1411   // Track the correctness of vccz through this basic block. There are two
1412   // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
1413   // ST->partialVCCWritesUpdateVCCZ().
1414   bool VCCZCorrect = true;
1415   if (ST->hasReadVCCZBug()) {
1416     // vccz could be incorrect at a basic block boundary if a predecessor wrote
1417     // to vcc and then issued an smem load.
1418     VCCZCorrect = false;
1419   } else if (!ST->partialVCCWritesUpdateVCCZ()) {
1420     // vccz could be incorrect at a basic block boundary if a predecessor wrote
1421     // to vcc_lo or vcc_hi.
1422     VCCZCorrect = false;
1423   }
1424 
1425   // Walk over the instructions.
1426   MachineInstr *OldWaitcntInstr = nullptr;
1427 
1428   for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1429                                          E = Block.instr_end();
1430        Iter != E;) {
1431     MachineInstr &Inst = *Iter;
1432 
1433     // Track pre-existing waitcnts from earlier iterations.
1434     if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1435         (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1436          Inst.getOperand(0).isReg() &&
1437          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
1438       if (!OldWaitcntInstr)
1439         OldWaitcntInstr = &Inst;
1440       ++Iter;
1441       continue;
1442     }
1443 
1444     // Generate an s_waitcnt instruction to be placed before Inst, if needed.
1445     Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1446     OldWaitcntInstr = nullptr;
1447 
1448     // Restore vccz if it's not known to be correct already.
1449     bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
1450 
1451     // Don't examine operands unless we need to track vccz correctness.
1452     if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
1453       if (Inst.definesRegister(AMDGPU::VCC_LO) ||
1454           Inst.definesRegister(AMDGPU::VCC_HI)) {
1455         // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
1456         if (!ST->partialVCCWritesUpdateVCCZ())
1457           VCCZCorrect = false;
1458       } else if (Inst.definesRegister(AMDGPU::VCC)) {
1459         // There is a hardware bug on CI/SI where an SMRD instruction may
1460         // corrupt the vccz bit, so when we detect that an instruction may read
1461         // a corrupted vccz bit, we need to:
1462         // 1. Insert s_waitcnt lgkmcnt(0) to wait for all outstanding SMRD
1463         //    operations to complete.
1464         // 2. Restore the correct value of vccz by writing the current value
1465         //    of vcc back to vcc.
1466         if (ST->hasReadVCCZBug() &&
1467             ScoreBrackets.getScoreLB(LGKM_CNT) <
1468                 ScoreBrackets.getScoreUB(LGKM_CNT) &&
1469             ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1470           // A write to vcc made while an smem read is outstanding may have
1471           // its vccz update clobbered as soon as any of the reads completes.
1472           VCCZCorrect = false;
1473         } else {
1474           // Writes to vcc will fix any incorrect value in vccz.
1475           VCCZCorrect = true;
1476         }
1477       }
1478     }
1479 
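         // Record the addresses read by scalar loads so that later stores which
         // may alias one of them can be made to wait for the loads to complete
         // (checked in generateWaitcntInstBefore).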
1480     if (TII->isSMRD(Inst)) {
1481       for (const MachineMemOperand *Memop : Inst.memoperands()) {
1482         const Value *Ptr = Memop->getValue();
1483         SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
1484       }
1485       if (ST->hasReadVCCZBug()) {
1486         // This smem read could complete and clobber vccz at any time.
1487         VCCZCorrect = false;
1488       }
1489     }
1490 
1491     updateEventWaitcntAfter(Inst, &ScoreBrackets);
1492 
1493 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1494     // If this instruction generates a S_SETVSKIP because it is an
1495     // indexed resource, and we are on Tahiti, then it will also force
1496     // an S_WAITCNT vmcnt(0)
1497     if (RequireCheckResourceType(Inst, context)) {
1498       // Force the score as if an S_WAITCNT vmcnt(0) had been emitted.
1499       ScoreBrackets->setScoreLB(VM_CNT,
1500       ScoreBrackets->getScoreUB(VM_CNT));
1501     }
1502 #endif
1503 
1504     LLVM_DEBUG({
1505       Inst.print(dbgs());
1506       ScoreBrackets.dump();
1507     });
1508 
1509     // TODO: Remove this work-around after fixing the scheduler and enable the
1510     // assert above.
1511     if (RestoreVCCZ) {
1512       // Restore the vccz bit.  Any time a value is written to vcc, the vccz
1513       // bit is updated, so we can restore the bit by reading the value of
1514       // vcc and then writing it back to the register.
1515       BuildMI(Block, Inst, Inst.getDebugLoc(),
1516               TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1517               TRI->getVCC())
1518           .addReg(TRI->getVCC());
1519       VCCZCorrect = true;
1520       Modified = true;
1521     }
1522 
1523     ++Iter;
1524   }
1525 
1526   return Modified;
1527 }
1528 
1529 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1530   ST = &MF.getSubtarget<GCNSubtarget>();
1531   TII = ST->getInstrInfo();
1532   TRI = &TII->getRegisterInfo();
1533   MRI = &MF.getRegInfo();
1534   IV = AMDGPU::getIsaVersion(ST->getCPU());
1535   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1536   PDT = &getAnalysis<MachinePostDominatorTree>();
1537 
1538   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1539   for (auto T : inst_counter_types())
1540     ForceEmitWaitcnt[T] = false;
1541 
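       // Query the maximum value each counter can encode on this subtarget; a
       // VscntMax of 0 means the target has no separate VMEM store counter.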
1542   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1543   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1544   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1545   HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
1546 
1547   unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
1548   unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
1549   assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1550   assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1551 
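       // Cache the hardware encoding range of the addressable VGPRs and SGPRs;
       // register operands are later mapped into these ranges to index the score
       // brackets.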
1552   RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1553   RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
1554   RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1555   RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;
1556 
1557   TrackedWaitcntSet.clear();
1558   BlockInfos.clear();
1559 
1560   // Keep iterating over the blocks in reverse post order, inserting and
1561   // updating s_waitcnt where needed, until a fixpoint is reached.
1562   for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
1563     BlockInfos.insert({MBB, BlockInfo(MBB)});
1564 
1565   std::unique_ptr<WaitcntBrackets> Brackets;
1566   bool Modified = false;
1567   bool Repeat;
1568   do {
1569     Repeat = false;
1570 
1571     for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
1572          ++BII) {
1573       BlockInfo &BI = BII->second;
1574       if (!BI.Dirty)
1575         continue;
1576 
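           // Reuse a single WaitcntBrackets object across blocks: seed it from
           // the merged incoming state when one exists, otherwise reset it to an
           // empty state for this subtarget.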
1577       if (BI.Incoming) {
1578         if (!Brackets)
1579           Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
1580         else
1581           *Brackets = *BI.Incoming;
1582       } else {
1583         if (!Brackets)
1584           Brackets = std::make_unique<WaitcntBrackets>(ST);
1585         else
1586           *Brackets = WaitcntBrackets(ST);
1587       }
1588 
1589       Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1590       BI.Dirty = false;
1591 
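           // Propagate the outgoing state to the successors: move it into the
           // first successor that has no incoming state yet, copy it for any
           // other such successor, and merge it into successors that already
           // have one. If a dirtied successor does not come later in the
           // traversal order, another round of the fixpoint loop is needed.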
1592       if (Brackets->hasPending()) {
1593         BlockInfo *MoveBracketsToSucc = nullptr;
1594         for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1595           auto SuccBII = BlockInfos.find(Succ);
1596           BlockInfo &SuccBI = SuccBII->second;
1597           if (!SuccBI.Incoming) {
1598             SuccBI.Dirty = true;
1599             if (SuccBII <= BII)
1600               Repeat = true;
1601             if (!MoveBracketsToSucc) {
1602               MoveBracketsToSucc = &SuccBI;
1603             } else {
1604               SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
1605             }
1606           } else if (SuccBI.Incoming->merge(*Brackets)) {
1607             SuccBI.Dirty = true;
1608             if (SuccBII <= BII)
1609               Repeat = true;
1610           }
1611         }
1612         if (MoveBracketsToSucc)
1613           MoveBracketsToSucc->Incoming = std::move(Brackets);
1614       }
1615     }
1616   } while (Repeat);
1617 
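       // Scan for scalar stores and collect the blocks that end the program or
       // return to the epilog; if any scalar stores exist, an s_dcache_wb may
       // need to be inserted before those exits (handled below).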
1618   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1619 
1620   bool HaveScalarStores = false;
1621 
1622   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1623        ++BI) {
1624     MachineBasicBlock &MBB = *BI;
1625 
1626     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1627          ++I) {
1628       if (!HaveScalarStores && TII->isScalarStore(*I))
1629         HaveScalarStores = true;
1630 
1631       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1632           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1633         EndPgmBlocks.push_back(&MBB);
1634     }
1635   }
1636 
1637   if (HaveScalarStores) {
1638     // If scalar writes are used, the cache must be flushed or else the next
1639     // wave to reuse the same scratch memory can be clobbered.
1640     //
1641     // Insert s_dcache_wb at wave termination points if there were any scalar
1642     // stores, and only if the cache hasn't already been flushed. This could be
1643     // improved by looking across blocks for flushes in postdominating blocks
1644     // from the stores but an explicitly requested flush is probably very rare.
1645     for (MachineBasicBlock *MBB : EndPgmBlocks) {
1646       bool SeenDCacheWB = false;
1647 
1648       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1649            ++I) {
1650         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1651           SeenDCacheWB = true;
1652         else if (TII->isScalarStore(*I))
1653           SeenDCacheWB = false;
1654 
1655         // FIXME: It would be better to insert this before a waitcnt if any.
1656         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1657              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1658             !SeenDCacheWB) {
1659           Modified = true;
1660           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1661         }
1662       }
1663     }
1664   }
1665 
1666   if (!MFI->isEntryFunction()) {
1667     // Wait for any outstanding memory operations that the input registers may
1668     // depend on. We can't track them, and it's better to do the wait after
1669     // the costly call sequence.
1670 
1671     // TODO: Could insert earlier and schedule more liberally with operations
1672     // that only use caller preserved registers.
1673     MachineBasicBlock &EntryBB = MF.front();
1674     MachineBasicBlock::iterator I = EntryBB.begin();
1675     for (MachineBasicBlock::iterator E = EntryBB.end();
1676          I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
1677       ;
1678     BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
1679     if (ST->hasVscnt())
1680       BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
1681           .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1682           .addImm(0);
1683 
1684     Modified = true;
1685   }
1686 
1687   return Modified;
1688 }
1689