1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
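///
/// For example, given
///   buffer_load_dword v0, ...
///   v_add_f32 v1, v0, v0
/// the pass inserts
///   s_waitcnt vmcnt(0)
/// before the v_add_f32 (an illustrative sketch; the actual count depends on
/// how many VMEM operations are still allowed to be outstanding).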
23 //
24 //===----------------------------------------------------------------------===//
25 
26 #include "AMDGPU.h"
27 #include "GCNSubtarget.h"
28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "llvm/ADT/MapVector.h"
31 #include "llvm/ADT/PostOrderIterator.h"
32 #include "llvm/CodeGen/MachinePostDominators.h"
33 #include "llvm/InitializePasses.h"
34 #include "llvm/Support/DebugCounter.h"
35 #include "llvm/Support/TargetParser.h"
36 using namespace llvm;
37 
38 #define DEBUG_TYPE "si-insert-waitcnts"
39 
40 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
41               "Force emit s_waitcnt expcnt(0) instrs");
42 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
43               "Force emit s_waitcnt lgkmcnt(0) instrs");
44 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
45               "Force emit s_waitcnt vmcnt(0) instrs");
46 
47 static cl::opt<bool> ForceEmitZeroFlag(
48   "amdgpu-waitcnt-forcezero",
49   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
50   cl::init(false), cl::Hidden);
51 
52 namespace {
53 
54 template <typename EnumT>
55 class enum_iterator
56     : public iterator_facade_base<enum_iterator<EnumT>,
57                                   std::forward_iterator_tag, const EnumT> {
58   EnumT Value;
59 public:
60   enum_iterator() = default;
61   enum_iterator(EnumT Value) : Value(Value) {}
62 
63   enum_iterator &operator++() {
64     Value = static_cast<EnumT>(Value + 1);
65     return *this;
66   }
67 
68   bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
69 
70   EnumT operator*() const { return Value; }
71 };
72 
73 // Class of object that encapsulates the latest instruction counter score
74 // associated with an operand. Used for determining whether an s_waitcnt
75 // instruction needs to be emitted.
76 
77 #define CNT_MASK(t) (1u << (t))
78 
79 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
80 
81 iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
82   return make_range(enum_iterator<InstCounterType>(VM_CNT),
83                     enum_iterator<InstCounterType>(NUM_INST_CNTS));
84 }
85 
86 using RegInterval = std::pair<int, int>;
87 
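// Per-counter hardware limits. Like RegisterEncoding below, these values are
// target-dependent and are initialized from the subtarget before the pass
// processes a function.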
88 struct {
89   unsigned VmcntMax;
90   unsigned ExpcntMax;
91   unsigned LgkmcntMax;
92   unsigned VscntMax;
93 } HardwareLimits;
94 
95 struct {
96   unsigned VGPR0;
97   unsigned VGPRL;
98   unsigned SGPR0;
99   unsigned SGPRL;
100 } RegisterEncoding;
101 
102 enum WaitEventType {
103   VMEM_ACCESS,      // vector-memory read & write
104   VMEM_READ_ACCESS, // vector-memory read
105   VMEM_WRITE_ACCESS, // vector-memory write
106   LDS_ACCESS,       // lds read & write
107   GDS_ACCESS,       // gds read & write
108   SQ_MESSAGE,       // send message
109   SMEM_ACCESS,      // scalar-memory read & write
110   EXP_GPR_LOCK,     // export holding on its data src
111   GDS_GPR_LOCK,     // GDS holding on its data and addr src
112   EXP_POS_ACCESS,   // write to export position
113   EXP_PARAM_ACCESS, // write to export parameter
114   VMW_GPR_LOCK,     // vector-memory write holding on its data src
115   NUM_WAIT_EVENTS,
116 };
117 
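// For each counter (indexed by InstCounterType), the set of events that
// increment that counter.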
118 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
119   (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
120   (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
121       (1 << SQ_MESSAGE),
122   (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
123       (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
124   (1 << VMEM_WRITE_ACCESS)
125 };
126 
127 // The mapping is:
128 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
129 //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
130 //  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
131 // We reserve a fixed number of VGPR slots in the scoring tables for
132 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
133 enum RegisterMapping {
134   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
135   AGPR_OFFSET = 226, // Maximum programmable ArchVGPRs across all targets.
136   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
137   NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
138   EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
139   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
140 };
141 
142 // Enumerate different types of result-returning VMEM operations. Although
143 // s_waitcnt orders them all with a single vmcnt counter, in the absence of
144 // s_waitcnt only instructions of the same VmemType are guaranteed to write
145 // their results in order -- so there is no need to insert an s_waitcnt between
146 // two instructions of the same type that write the same vgpr.
147 enum VmemType {
148   // BUF instructions and MIMG instructions without a sampler.
149   VMEM_NOSAMPLER,
150   // MIMG instructions with a sampler.
151   VMEM_SAMPLER,
152 };
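// For example, two buffer loads that write the same vgpr need no intervening
// s_waitcnt, but a buffer load followed by an image sample writing that vgpr
// does, because the two VmemTypes may complete out of order.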
153 
154 VmemType getVmemType(const MachineInstr &Inst) {
155   assert(SIInstrInfo::isVMEM(Inst));
156   if (!SIInstrInfo::isMIMG(Inst))
157     return VMEM_NOSAMPLER;
158   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
159   return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler
160              ? VMEM_SAMPLER
161              : VMEM_NOSAMPLER;
162 }
163 
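// Tighten \p Wait for counter \p T. Waitcnt fields default to ~0u ("no wait"),
// so taking the minimum with \p Count only ever makes the required wait
// stricter.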
164 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
165   switch (T) {
166   case VM_CNT:
167     Wait.VmCnt = std::min(Wait.VmCnt, Count);
168     break;
169   case EXP_CNT:
170     Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
171     break;
172   case LGKM_CNT:
173     Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
174     break;
175   case VS_CNT:
176     Wait.VsCnt = std::min(Wait.VsCnt, Count);
177     break;
178   default:
179     llvm_unreachable("bad InstCounterType");
180   }
181 }
182 
183 // This object maintains the current score brackets of each wait counter, and
184 // a per-register scoreboard for each wait counter.
185 //
186 // We also maintain the latest score for every event type that can change the
187 // waitcnt, in order to know if there are multiple types of events within
188 // the brackets. When multiple event types are pending in a bracket, the
189 // wait count may be decremented out of order, so we need to put in a
190 // conservative "s_waitcnt 0" before use.
191 class WaitcntBrackets {
192 public:
193   WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {}
194 
195   static unsigned getWaitCountMax(InstCounterType T) {
196     switch (T) {
197     case VM_CNT:
198       return HardwareLimits.VmcntMax;
199     case LGKM_CNT:
200       return HardwareLimits.LgkmcntMax;
201     case EXP_CNT:
202       return HardwareLimits.ExpcntMax;
203     case VS_CNT:
204       return HardwareLimits.VscntMax;
205     default:
206       break;
207     }
208     return 0;
209   }
210 
211   unsigned getScoreLB(InstCounterType T) const {
212     assert(T < NUM_INST_CNTS);
213     return ScoreLBs[T];
214   }
215 
216   unsigned getScoreUB(InstCounterType T) const {
217     assert(T < NUM_INST_CNTS);
218     return ScoreUBs[T];
219   }
220 
221   // Mapping from event to counter.
222   InstCounterType eventCounter(WaitEventType E) {
223     if (WaitEventMaskForInst[VM_CNT] & (1 << E))
224       return VM_CNT;
225     if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
226       return LGKM_CNT;
227     if (WaitEventMaskForInst[VS_CNT] & (1 << E))
228       return VS_CNT;
229     assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
230     return EXP_CNT;
231   }
232 
233   unsigned getRegScore(int GprNo, InstCounterType T) {
234     if (GprNo < NUM_ALL_VGPRS) {
235       return VgprScores[T][GprNo];
236     }
237     assert(T == LGKM_CNT);
238     return SgprScores[GprNo - NUM_ALL_VGPRS];
239   }
240 
241   bool merge(const WaitcntBrackets &Other);
242 
243   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
244                              const MachineRegisterInfo *MRI,
245                              const SIRegisterInfo *TRI, unsigned OpNo) const;
246 
247   bool counterOutOfOrder(InstCounterType T) const;
248   void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
249   void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
250   void determineWait(InstCounterType T, unsigned ScoreToWait,
251                      AMDGPU::Waitcnt &Wait) const;
252   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
253   void applyWaitcnt(InstCounterType T, unsigned Count);
254   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
255                      const MachineRegisterInfo *MRI, WaitEventType E,
256                      MachineInstr &MI);
257 
258   bool hasPending() const { return PendingEvents != 0; }
259   bool hasPendingEvent(WaitEventType E) const {
260     return PendingEvents & (1 << E);
261   }
262 
263   bool hasMixedPendingEvents(InstCounterType T) const {
264     unsigned Events = PendingEvents & WaitEventMaskForInst[T];
265     // Return true if more than one bit is set in Events.
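    // (Events & (Events - 1)) clears the lowest set bit, so the result is
    // nonzero iff at least two event types are pending.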
266     return Events & (Events - 1);
267   }
268 
269   bool hasPendingFlat() const {
270     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
271              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
272             (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
273              LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
274   }
275 
276   void setPendingFlat() {
277     LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
278     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
279   }
280 
281   // Return true if there might be pending writes to the specified vgpr by VMEM
282   // instructions with types different from V.
283   bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
284     assert(GprNo < NUM_ALL_VGPRS);
285     return VgprVmemTypes[GprNo] & ~(1 << V);
286   }
287 
288   void clearVgprVmemTypes(int GprNo) {
289     assert(GprNo < NUM_ALL_VGPRS);
290     VgprVmemTypes[GprNo] = 0;
291   }
292 
293   void print(raw_ostream &);
294   void dump() { print(dbgs()); }
295 
296 private:
297   struct MergeInfo {
298     unsigned OldLB;
299     unsigned OtherLB;
300     unsigned MyShift;
301     unsigned OtherShift;
302   };
303   static bool mergeScore(const MergeInfo &M, unsigned &Score,
304                          unsigned OtherScore);
305 
306   void setScoreLB(InstCounterType T, unsigned Val) {
307     assert(T < NUM_INST_CNTS);
308     ScoreLBs[T] = Val;
309   }
310 
311   void setScoreUB(InstCounterType T, unsigned Val) {
312     assert(T < NUM_INST_CNTS);
313     ScoreUBs[T] = Val;
314     if (T == EXP_CNT) {
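      // The hardware expcnt counter saturates at its maximum encodable value,
      // so once more than getWaitCountMax(EXP_CNT) exports are in flight the
      // oldest ones must already have completed; advance the lower bound
      // accordingly. (Assumed rationale for this clamp.)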
315       unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
316       if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
317         ScoreLBs[T] = UB;
318     }
319   }
320 
321   void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
322     if (GprNo < NUM_ALL_VGPRS) {
323       VgprUB = std::max(VgprUB, GprNo);
324       VgprScores[T][GprNo] = Val;
325     } else {
326       assert(T == LGKM_CNT);
327       SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
328       SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
329     }
330   }
331 
332   void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
333                    const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
334                    unsigned OpNo, unsigned Val);
335 
336   const GCNSubtarget *ST = nullptr;
337   unsigned ScoreLBs[NUM_INST_CNTS] = {0};
338   unsigned ScoreUBs[NUM_INST_CNTS] = {0};
339   unsigned PendingEvents = 0;
340   // Remember the last flat memory operation.
341   unsigned LastFlat[NUM_INST_CNTS] = {0};
342   // Wait counter scores for every vgpr.
343   // Keep track of VgprUB and SgprUB to make merges at block joins efficient.
344   int VgprUB = -1;
345   int SgprUB = -1;
346   unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
347   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
348   unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
349   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
350   // write to each vgpr.
351   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
352 };
353 
354 class SIInsertWaitcnts : public MachineFunctionPass {
355 private:
356   const GCNSubtarget *ST = nullptr;
357   const SIInstrInfo *TII = nullptr;
358   const SIRegisterInfo *TRI = nullptr;
359   const MachineRegisterInfo *MRI = nullptr;
360   AMDGPU::IsaVersion IV;
361 
362   DenseSet<MachineInstr *> TrackedWaitcntSet;
363   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
364   MachinePostDominatorTree *PDT;
365 
366   struct BlockInfo {
367     MachineBasicBlock *MBB;
368     std::unique_ptr<WaitcntBrackets> Incoming;
369     bool Dirty = true;
370 
371     explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
372   };
373 
374   MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
375 
376   // ForceEmitZeroWaitcnts: force all waitcnt instructions to be emitted as
377   // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
378   bool ForceEmitZeroWaitcnts;
379   bool ForceEmitWaitcnt[NUM_INST_CNTS];
380 
381 public:
382   static char ID;
383 
384   SIInsertWaitcnts() : MachineFunctionPass(ID) {
385     (void)ForceExpCounter;
386     (void)ForceLgkmCounter;
387     (void)ForceVMCounter;
388   }
389 
390   bool runOnMachineFunction(MachineFunction &MF) override;
391 
392   StringRef getPassName() const override {
393     return "SI insert wait instructions";
394   }
395 
396   void getAnalysisUsage(AnalysisUsage &AU) const override {
397     AU.setPreservesCFG();
398     AU.addRequired<MachinePostDominatorTree>();
399     MachineFunctionPass::getAnalysisUsage(AU);
400   }
401 
402   bool isForceEmitWaitcnt() const {
403     for (auto T : inst_counter_types())
404       if (ForceEmitWaitcnt[T])
405         return true;
406     return false;
407   }
408 
409   void setForceEmitWaitcnt() {
410 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
411 // For debug builds, get the debug counter info and adjust if need be
412 #ifndef NDEBUG
413     if (DebugCounter::isCounterSet(ForceExpCounter) &&
414         DebugCounter::shouldExecute(ForceExpCounter)) {
415       ForceEmitWaitcnt[EXP_CNT] = true;
416     } else {
417       ForceEmitWaitcnt[EXP_CNT] = false;
418     }
419 
420     if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
421         DebugCounter::shouldExecute(ForceLgkmCounter)) {
422       ForceEmitWaitcnt[LGKM_CNT] = true;
423     } else {
424       ForceEmitWaitcnt[LGKM_CNT] = false;
425     }
426 
427     if (DebugCounter::isCounterSet(ForceVMCounter) &&
428         DebugCounter::shouldExecute(ForceVMCounter)) {
429       ForceEmitWaitcnt[VM_CNT] = true;
430     } else {
431       ForceEmitWaitcnt[VM_CNT] = false;
432     }
433 #endif // NDEBUG
434   }
435 
436   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
437   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
438   bool generateWaitcntInstBefore(MachineInstr &MI,
439                                  WaitcntBrackets &ScoreBrackets,
440                                  MachineInstr *OldWaitcntInstr);
441   void updateEventWaitcntAfter(MachineInstr &Inst,
442                                WaitcntBrackets *ScoreBrackets);
443   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
444                             WaitcntBrackets &ScoreBrackets);
445   bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
446                                MachineInstr &OldWaitcntInstr,
447                                AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
448 };
449 
450 } // end anonymous namespace
451 
452 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
453                                             const SIInstrInfo *TII,
454                                             const MachineRegisterInfo *MRI,
455                                             const SIRegisterInfo *TRI,
456                                             unsigned OpNo) const {
457   const MachineOperand &Op = MI->getOperand(OpNo);
458   if (!TRI->isInAllocatableClass(Op.getReg()))
459     return {-1, -1};
460 
461   // A use via a partial-write (PW) operand does not need a waitcnt.
462   // A partial write is not a WAW hazard.
463   assert(!Op.getSubReg() || !Op.isUndef());
464 
465   RegInterval Result;
466 
467   unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
468 
469   if (TRI->isVectorRegister(*MRI, Op.getReg())) {
470     assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
471     Result.first = Reg - RegisterEncoding.VGPR0;
472     if (TRI->isAGPR(*MRI, Op.getReg()))
473       Result.first += AGPR_OFFSET;
474     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
475   } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
476     assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
477     Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
478     assert(Result.first >= NUM_ALL_VGPRS &&
479            Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
480   }
481   // TODO: Handle TTMP
482   // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
483   else
484     return {-1, -1};
485 
486   const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
487   unsigned Size = TRI->getRegSizeInBits(*RC);
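  // One scoreboard slot per 32-bit lane; 16-bit registers round up to a full
  // slot.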
488   Result.second = Result.first + ((Size + 16) / 32);
489 
490   return Result;
491 }
492 
493 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
494                                   const SIInstrInfo *TII,
495                                   const SIRegisterInfo *TRI,
496                                   const MachineRegisterInfo *MRI, unsigned OpNo,
497                                   unsigned Val) {
498   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
499   assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
500   for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
501     setRegScore(RegNo, EXP_CNT, Val);
502   }
503 }
504 
505 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
506                                     const SIRegisterInfo *TRI,
507                                     const MachineRegisterInfo *MRI,
508                                     WaitEventType E, MachineInstr &Inst) {
509   InstCounterType T = eventCounter(E);
510   unsigned CurrScore = getScoreUB(T) + 1;
511   if (CurrScore == 0)
512     report_fatal_error("InsertWaitcnt score wraparound");
513   // PendingEvents and ScoreUB need to be updated regardless of whether this
514   // event changes the score of a register or not.
515   // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
516   PendingEvents |= 1 << E;
517   setScoreUB(T, CurrScore);
518 
519   if (T == EXP_CNT) {
520     // Put score on the source vgprs. If this is a store, just use those
521     // specific register(s).
522     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
523       int AddrOpIdx =
524           AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
525       // All GDS operations must protect their address register (same as
526       // export.)
527       if (AddrOpIdx != -1) {
528         setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
529       }
530 
531       if (Inst.mayStore()) {
532         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
533                                        AMDGPU::OpName::data0) != -1) {
534           setExpScore(
535               &Inst, TII, TRI, MRI,
536               AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
537               CurrScore);
538         }
539         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
540                                        AMDGPU::OpName::data1) != -1) {
541           setExpScore(&Inst, TII, TRI, MRI,
542                       AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
543                                                  AMDGPU::OpName::data1),
544                       CurrScore);
545         }
546       } else if (SIInstrInfo::isAtomicRet(Inst) &&
547                  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
548                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
549                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
550                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
551                  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
552                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
553                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
554                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
555         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
556           const MachineOperand &Op = Inst.getOperand(I);
557           if (Op.isReg() && !Op.isDef() &&
558               TRI->isVectorRegister(*MRI, Op.getReg())) {
559             setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
560           }
561         }
562       }
563     } else if (TII->isFLAT(Inst)) {
564       if (Inst.mayStore()) {
565         setExpScore(
566             &Inst, TII, TRI, MRI,
567             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
568             CurrScore);
569       } else if (SIInstrInfo::isAtomicRet(Inst)) {
570         setExpScore(
571             &Inst, TII, TRI, MRI,
572             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
573             CurrScore);
574       }
575     } else if (TII->isMIMG(Inst)) {
576       if (Inst.mayStore()) {
577         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
578       } else if (SIInstrInfo::isAtomicRet(Inst)) {
579         setExpScore(
580             &Inst, TII, TRI, MRI,
581             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
582             CurrScore);
583       }
584     } else if (TII->isMTBUF(Inst)) {
585       if (Inst.mayStore()) {
586         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
587       }
588     } else if (TII->isMUBUF(Inst)) {
589       if (Inst.mayStore()) {
590         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
591       } else if (SIInstrInfo::isAtomicRet(Inst)) {
592         setExpScore(
593             &Inst, TII, TRI, MRI,
594             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
595             CurrScore);
596       }
597     } else {
598       if (TII->isEXP(Inst)) {
599         // For export the destination registers are really temps that
600         // can be used as the actual source after export patching, so
601         // we need to treat them like sources and set the EXP_CNT
602         // score.
603         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
604           MachineOperand &DefMO = Inst.getOperand(I);
605           if (DefMO.isReg() && DefMO.isDef() &&
606               TRI->isVGPR(*MRI, DefMO.getReg())) {
607             setRegScore(
608                 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
609                 EXP_CNT, CurrScore);
610           }
611         }
612       }
613       for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
614         MachineOperand &MO = Inst.getOperand(I);
615         if (MO.isReg() && !MO.isDef() &&
616             TRI->isVectorRegister(*MRI, MO.getReg())) {
617           setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
618         }
619       }
620     }
621 #if 0 // TODO: check if this is handled by MUBUF code above.
622   } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
623        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
624        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
625     MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
626     unsigned OpNo;//TODO: find the OpNo for this operand;
627     RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
628     for (int RegNo = Interval.first; RegNo < Interval.second;
629     ++RegNo) {
630       setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
631     }
632 #endif
633   } else {
634     // Match the score to the destination registers.
635     for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
636       auto &Op = Inst.getOperand(I);
637       if (!Op.isReg() || !Op.isDef())
638         continue;
639       RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
640       if (T == VM_CNT) {
641         if (Interval.first >= NUM_ALL_VGPRS)
642           continue;
643         if (SIInstrInfo::isVMEM(Inst)) {
644           VmemType V = getVmemType(Inst);
645           for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
646             VgprVmemTypes[RegNo] |= 1 << V;
647         }
648       }
649       for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
650         setRegScore(RegNo, T, CurrScore);
651       }
652     }
653     if (TII->isDS(Inst) && Inst.mayStore()) {
654       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
655     }
656   }
657 }
658 
659 void WaitcntBrackets::print(raw_ostream &OS) {
660   OS << '\n';
661   for (auto T : inst_counter_types()) {
662     unsigned LB = getScoreLB(T);
663     unsigned UB = getScoreUB(T);
664 
665     switch (T) {
666     case VM_CNT:
667       OS << "    VM_CNT(" << UB - LB << "): ";
668       break;
669     case LGKM_CNT:
670       OS << "    LGKM_CNT(" << UB - LB << "): ";
671       break;
672     case EXP_CNT:
673       OS << "    EXP_CNT(" << UB - LB << "): ";
674       break;
675     case VS_CNT:
676       OS << "    VS_CNT(" << UB - LB << "): ";
677       break;
678     default:
679       OS << "    UNKNOWN(" << UB - LB << "): ";
680       break;
681     }
682 
683     if (LB < UB) {
684       // Print vgpr scores.
685       for (int J = 0; J <= VgprUB; J++) {
686         unsigned RegScore = getRegScore(J, T);
687         if (RegScore <= LB)
688           continue;
689         unsigned RelScore = RegScore - LB - 1;
690         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
691           OS << RelScore << ":v" << J << " ";
692         } else {
693           OS << RelScore << ":ds ";
694         }
695       }
696       // Also need to print sgpr scores for lgkm_cnt.
697       if (T == LGKM_CNT) {
698         for (int J = 0; J <= SgprUB; J++) {
699           unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
700           if (RegScore <= LB)
701             continue;
702           unsigned RelScore = RegScore - LB - 1;
703           OS << RelScore << ":s" << J << " ";
704         }
705       }
706     }
707     OS << '\n';
708   }
709   OS << '\n';
710 }
711 
712 /// Simplify the waitcnt, in the sense of removing redundant counts. A count
713 /// is marked as "no wait" (~0u) when it is not actually needed.
714 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
715   simplifyWaitcnt(VM_CNT, Wait.VmCnt);
716   simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
717   simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
718   simplifyWaitcnt(VS_CNT, Wait.VsCnt);
719 }
720 
721 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
722                                       unsigned &Count) const {
723   const unsigned LB = getScoreLB(T);
724   const unsigned UB = getScoreUB(T);
725 
726   // The number of outstanding events for this type, T, can be calculated
727   // as (UB - LB). If the current Count is greater than or equal to the number
728   // of outstanding events, then the wait for this counter is redundant.
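  // For example, with three outstanding events a wait of 3 or more is a no-op
  // and is dropped (~0u means "no wait"), while a wait of 2 is kept.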
729   if (Count >= UB - LB)
730     Count = ~0u;
731 }
732 
733 void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
734                                     AMDGPU::Waitcnt &Wait) const {
735   // If the score of src_operand falls within the bracket, we need an
736   // s_waitcnt instruction.
737   const unsigned LB = getScoreLB(T);
738   const unsigned UB = getScoreUB(T);
739   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
740     if ((T == VM_CNT || T == LGKM_CNT) &&
741         hasPendingFlat() &&
742         !ST->hasFlatLgkmVMemCountInOrder()) {
743       // If there is a pending FLAT operation, and this is a VMem or LGKM
744       // waitcnt and the target can report early completion, then we need
745       // to force a waitcnt 0.
746       addWait(Wait, T, 0);
747     } else if (counterOutOfOrder(T)) {
748       // The counter can get decremented out-of-order when there are
749       // multiple event types in the bracket, so emit an s_waitcnt with a
750       // conservative value of 0 for this counter.
751       addWait(Wait, T, 0);
752     } else {
753       // If a counter has been maxed out avoid overflow by waiting for
754       // MAX(CounterType) - 1 instead.
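      // For example, with UB - LB == 3 outstanding events, waiting for the
      // oldest one (ScoreToWait == LB + 1) yields a count of UB - ScoreToWait
      // == 2, i.e. at most two younger events may still be in flight.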
755       unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
756       addWait(Wait, T, NeededWait);
757     }
758   }
759 }
760 
761 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
762   applyWaitcnt(VM_CNT, Wait.VmCnt);
763   applyWaitcnt(EXP_CNT, Wait.ExpCnt);
764   applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
765   applyWaitcnt(VS_CNT, Wait.VsCnt);
766 }
767 
768 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
769   const unsigned UB = getScoreUB(T);
770   if (Count >= UB)
771     return;
772   if (Count != 0) {
773     if (counterOutOfOrder(T))
774       return;
775     setScoreLB(T, std::max(getScoreLB(T), UB - Count));
776   } else {
777     setScoreLB(T, UB);
778     PendingEvents &= ~WaitEventMaskForInst[T];
779   }
780 }
781 
782 // Where there are multiple types of event in the bracket of a counter,
783 // the decrement may go out of order.
784 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
785   // Scalar memory reads can always complete out of order.
786   if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
787     return true;
788   return hasMixedPendingEvents(T);
789 }
790 
791 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
792                       false)
793 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
794 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
795                     false)
796 
797 char SIInsertWaitcnts::ID = 0;
798 
799 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
800 
801 FunctionPass *llvm::createSIInsertWaitcntsPass() {
802   return new SIInsertWaitcnts();
803 }
804 
805 /// Combine consecutive waitcnt instructions that precede \p MI and follow
806 /// \p OldWaitcntInstr, and apply any extra waits from waitcnts that were added
807 /// by previous passes. Currently this pass conservatively assumes that these
808 /// preexisting waitcnts are required for correctness.
809 bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
810                                                MachineInstr &OldWaitcntInstr,
811                                                AMDGPU::Waitcnt &Wait,
812                                                const MachineInstr *MI) {
813   bool Modified = false;
814   MachineInstr *WaitcntInstr = nullptr;
815   MachineInstr *WaitcntVsCntInstr = nullptr;
816   for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
817        &*II != MI; II = NextI, ++NextI) {
818     if (II->isMetaInstruction())
819       continue;
820 
821     if (II->getOpcode() == AMDGPU::S_WAITCNT) {
822       // Conservatively update required wait if this waitcnt was added in an
823       // earlier pass. In this case it will not exist in the tracked waitcnt
824       // set.
825       if (!TrackedWaitcntSet.count(&*II)) {
826         unsigned IEnc = II->getOperand(0).getImm();
827         AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
828         Wait = Wait.combined(OldWait);
829       }
830 
831       // Merge consecutive waitcnt of the same type by erasing multiples.
832       if (!WaitcntInstr) {
833         WaitcntInstr = &*II;
834       } else {
835         II->eraseFromParent();
836         Modified = true;
837       }
838 
839     } else {
840       assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
841       assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
842       if (!TrackedWaitcntSet.count(&*II)) {
843         unsigned OldVSCnt =
844             TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
845         Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
846       }
847 
848       if (!WaitcntVsCntInstr) {
849         WaitcntVsCntInstr = &*II;
850       } else {
851         II->eraseFromParent();
852         Modified = true;
853       }
854     }
855   }
856 
857   // Update the encoding of the merged waitcnt with the required wait.
858   if (WaitcntInstr) {
859     if (Wait.hasWaitExceptVsCnt()) {
860       unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
861       unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
862       if (OldEnc != NewEnc) {
863         WaitcntInstr->getOperand(0).setImm(NewEnc);
864         Modified = true;
865       }
866       ScoreBrackets.applyWaitcnt(Wait);
867       Wait.VmCnt = ~0u;
868       Wait.LgkmCnt = ~0u;
869       Wait.ExpCnt = ~0u;
870 
871       LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
872                         << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
873                         << '\n');
874     } else {
875       WaitcntInstr->eraseFromParent();
876       Modified = true;
877     }
878   }
879 
880   if (WaitcntVsCntInstr) {
881     if (Wait.hasWaitVsCnt()) {
882       assert(ST->hasVscnt());
883       unsigned OldVSCnt =
884           TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
885               ->getImm();
886       if (Wait.VsCnt != OldVSCnt) {
887         TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
888             ->setImm(Wait.VsCnt);
889         Modified = true;
890       }
891       ScoreBrackets.applyWaitcnt(Wait);
892       Wait.VsCnt = ~0u;
893 
894       LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
895                         << "Old Instr: " << MI
896                         << "New Instr: " << *WaitcntVsCntInstr << '\n');
897     } else {
898       WaitcntVsCntInstr->eraseFromParent();
899       Modified = true;
900     }
901   }
902 
903   return Modified;
904 }
905 
906 static bool readsVCCZ(const MachineInstr &MI) {
907   unsigned Opc = MI.getOpcode();
908   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
909          !MI.getOperand(1).isUndef();
910 }
911 
912 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
913 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
914   // Currently all conventions wait, but this may not always be the case.
915   //
916   // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
917   // sense to omit the wait and do it in the caller.
918   return true;
919 }
920 
921 /// \returns true if the callee is expected to wait for any outstanding waits
922 /// before returning.
923 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
924   return true;
925 }
926 
927 ///  Generate an s_waitcnt instruction to be placed before \p MI.
928 ///  Instructions of a given type are returned in order,
929 ///  but instructions of different types can complete out of order.
930 ///  We rely on this in-order completion
931 ///  and simply assign a score to the memory access instructions.
932 ///  We keep track of the active "score bracket" to determine
933 ///  if an access of a memory read requires an s_waitcnt
934 ///  and if so what the value of each counter is.
935 ///  The "score bracket" is bound by the lower bound and upper bound
936 ///  scores (*_score_LB and *_score_ub respectively).
937 bool SIInsertWaitcnts::generateWaitcntInstBefore(
938     MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
939     MachineInstr *OldWaitcntInstr) {
940   setForceEmitWaitcnt();
941 
942   if (MI.isMetaInstruction())
943     return false;
944 
945   AMDGPU::Waitcnt Wait;
946   bool Modified = false;
947 
948   // See if this instruction has a forced S_WAITCNT VM.
949   // TODO: Handle other cases of NeedsWaitcntVmBefore()
950   if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
951       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
952       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
953       MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
954       MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
955     Wait.VmCnt = 0;
956   }
957 
958   // All waits must be resolved at call return.
959   // NOTE: this could be improved with knowledge of all call sites or
960   //   with knowledge of the called routines.
961   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
962       MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
963       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
964     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
965   }
966   // Resolve vm waits before gs-done.
967   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
968             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
969            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
970             AMDGPU::SendMsg::ID_GS_DONE)) {
971     Wait.VmCnt = 0;
972   }
973 #if 0 // TODO: the following blocks of logic when we have fence.
974   else if (MI.getOpcode() == SC_FENCE) {
975     const unsigned int group_size =
976       context->shader_info->GetMaxThreadGroupSize();
977     // group_size == 0 means thread group size is unknown at compile time
978     const bool group_is_multi_wave =
979       (group_size == 0 || group_size > target_info->GetWaveFrontSize());
980     const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
981 
982     for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
983       SCRegType src_type = Inst->GetSrcType(i);
984       switch (src_type) {
985         case SCMEM_LDS:
986           if (group_is_multi_wave ||
987             context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
988             EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
989                                ScoreBrackets->getScoreUB(LGKM_CNT));
990             // LDS may have to wait for VM_CNT after buffer load to LDS
991             if (target_info->HasBufferLoadToLDS()) {
992               EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
993                                  ScoreBrackets->getScoreUB(VM_CNT));
994             }
995           }
996           break;
997 
998         case SCMEM_GDS:
999           if (group_is_multi_wave || fence_is_global) {
1000             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1001               ScoreBrackets->getScoreUB(EXP_CNT));
1002             EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1003               ScoreBrackets->getScoreUB(LGKM_CNT));
1004           }
1005           break;
1006 
1007         case SCMEM_UAV:
1008         case SCMEM_TFBUF:
1009         case SCMEM_RING:
1010         case SCMEM_SCATTER:
1011           if (group_is_multi_wave || fence_is_global) {
1012             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1013               ScoreBrackets->getScoreUB(EXP_CNT));
1014             EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1015               ScoreBrackets->getScoreUB(VM_CNT));
1016           }
1017           break;
1018 
1019         case SCMEM_SCRATCH:
1020         default:
1021           break;
1022       }
1023     }
1024   }
1025 #endif
1026 
1027   // Export & GDS instructions do not read the EXEC mask until after the export
1028   // is granted (which can occur well after the instruction is issued).
1029   // The shader program must flush all EXP operations on the export-count
1030   // before overwriting the EXEC mask.
1031   else {
1032     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1033       // Export and GDS are tracked individually, either may trigger a waitcnt
1034       // for EXEC.
1035       if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1036           ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1037           ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1038           ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1039         Wait.ExpCnt = 0;
1040       }
1041     }
1042 
1043     if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1044       // The function is going to insert a wait on everything in its prolog.
1045       // We still need to be careful if the call target is a load (e.g. a GOT
1046       // load). We also need to check the WAW dependency with the saved PC.
1047       Wait = AMDGPU::Waitcnt();
1048 
1049       int CallAddrOpIdx =
1050           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1051 
1052       if (MI.getOperand(CallAddrOpIdx).isReg()) {
1053         RegInterval CallAddrOpInterval =
1054           ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
1055 
1056         for (int RegNo = CallAddrOpInterval.first;
1057              RegNo < CallAddrOpInterval.second; ++RegNo)
1058           ScoreBrackets.determineWait(
1059             LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1060 
1061         int RtnAddrOpIdx =
1062           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1063         if (RtnAddrOpIdx != -1) {
1064           RegInterval RtnAddrOpInterval =
1065             ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
1066 
1067           for (int RegNo = RtnAddrOpInterval.first;
1068                RegNo < RtnAddrOpInterval.second; ++RegNo)
1069             ScoreBrackets.determineWait(
1070               LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1071         }
1072       }
1073     } else {
1074       // FIXME: Should not be relying on memoperands.
1075       // Look at the source operands of every instruction to see if
1076       // any of them results from a previous memory operation that affects
1077       // its current usage. If so, an s_waitcnt instruction needs to be
1078       // emitted.
1079       // If the source operand was defined by a load, add the s_waitcnt
1080       // instruction.
1081       //
1082       // Two cases are handled for destination operands:
1083       // 1) If the destination operand was defined by a load, add the s_waitcnt
1084       // instruction to guarantee the right WAW order.
1085       // 2) If a destination operand was used by a recent export/store
1086       // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
1087       for (const MachineMemOperand *Memop : MI.memoperands()) {
1088         const Value *Ptr = Memop->getValue();
1089         if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1090           addWait(Wait, LGKM_CNT, 0);
1091           if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1092             SLoadAddresses.erase(Ptr);
1093         }
1094         unsigned AS = Memop->getAddrSpace();
1095         if (AS != AMDGPUAS::LOCAL_ADDRESS)
1096           continue;
1097         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1098         // VM_CNT is only relevant to vgpr or LDS.
1099         ScoreBrackets.determineWait(
1100             VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1101         if (Memop->isStore()) {
1102           ScoreBrackets.determineWait(
1103               EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1104         }
1105       }
1106 
1107       // Loop over use and def operands.
1108       for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1109         MachineOperand &Op = MI.getOperand(I);
1110         if (!Op.isReg())
1111           continue;
1112         RegInterval Interval =
1113             ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
1114 
1115         const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1116         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1117           if (IsVGPR) {
1118             // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1119             // previous write and this write are the same type of VMEM
1120             // instruction, in which case they're guaranteed to write their
1121             // results in order anyway.
1122             if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
1123                 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1124                                                        getVmemType(MI))) {
1125               ScoreBrackets.determineWait(
1126                   VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1127               ScoreBrackets.clearVgprVmemTypes(RegNo);
1128             }
1129             if (Op.isDef()) {
1130               ScoreBrackets.determineWait(
1131                   EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1132             }
1133           }
1134           ScoreBrackets.determineWait(
1135               LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1136         }
1137       }
1138     }
1139   }
1140 
1141   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1142   // occurs before the instruction. Doing it here prevents any additional
1143   // S_WAITCNTs from being emitted if the instruction was marked as
1144   // requiring a WAITCNT beforehand.
1145   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1146       !ST->hasAutoWaitcntBeforeBarrier()) {
1147     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1148   }
1149 
1150   // TODO: Remove this work-around, enable the assert for Bug 457939
1151   //       after fixing the scheduler. Also, the Shader Compiler code is
1152   //       independent of target.
1153   if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1154     if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1155             ScoreBrackets.getScoreUB(LGKM_CNT) &&
1156         ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1157       Wait.LgkmCnt = 0;
1158     }
1159   }
1160 
1161   // Verify that the wait is actually needed.
1162   ScoreBrackets.simplifyWaitcnt(Wait);
1163 
1164   if (ForceEmitZeroWaitcnts)
1165     Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
1166 
1167   if (ForceEmitWaitcnt[VM_CNT])
1168     Wait.VmCnt = 0;
1169   if (ForceEmitWaitcnt[EXP_CNT])
1170     Wait.ExpCnt = 0;
1171   if (ForceEmitWaitcnt[LGKM_CNT])
1172     Wait.LgkmCnt = 0;
1173   if (ForceEmitWaitcnt[VS_CNT])
1174     Wait.VsCnt = 0;
1175 
1176   if (OldWaitcntInstr) {
1177     // Try to merge the required wait with preexisting waitcnt instructions.
1178     // Also erase redundant waitcnt.
1179     Modified =
1180         applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
1181   } else {
1182     // Update waitcnt brackets after determining the required wait.
1183     ScoreBrackets.applyWaitcnt(Wait);
1184   }
1185 
1186   // Build new waitcnt instructions unless no wait is needed or the old waitcnt
1187   // instruction was modified to handle the required wait.
1188   if (Wait.hasWaitExceptVsCnt()) {
1189     unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1190     auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1191                              MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1192                          .addImm(Enc);
1193     TrackedWaitcntSet.insert(SWaitInst);
1194     Modified = true;
1195 
1196     LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1197                       << "Old Instr: " << MI
1198                       << "New Instr: " << *SWaitInst << '\n');
1199   }
1200 
1201   if (Wait.hasWaitVsCnt()) {
1202     assert(ST->hasVscnt());
1203 
1204     auto SWaitInst =
1205         BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1206                 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1207             .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1208             .addImm(Wait.VsCnt);
1209     TrackedWaitcntSet.insert(SWaitInst);
1210     Modified = true;
1211 
1212     LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1213                       << "Old Instr: " << MI
1214                       << "New Instr: " << *SWaitInst << '\n');
1215   }
1216 
1217   return Modified;
1218 }
1219 
1220 // This is a flat memory operation. Check to see if it has memory tokens other
1221 // than LDS. Other address spaces supported by flat memory operations involve
1222 // global memory.
1223 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1224   assert(TII->isFLAT(MI));
1225 
1226   // All flat instructions use the VMEM counter.
1227   assert(TII->usesVM_CNT(MI));
1228 
1229   // If there are no memory operands then conservatively assume the flat
1230   // operation may access VMEM.
1231   if (MI.memoperands_empty())
1232     return true;
1233 
1234   // See if any memory operand specifies an address space that involves VMEM.
1235   // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1236   // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1237   // (GDS) address space is not supported by flat operations. Therefore, simply
1238   // return true unless only the LDS address space is found.
1239   for (const MachineMemOperand *Memop : MI.memoperands()) {
1240     unsigned AS = Memop->getAddrSpace();
1241     assert(AS != AMDGPUAS::REGION_ADDRESS);
1242     if (AS != AMDGPUAS::LOCAL_ADDRESS)
1243       return true;
1244   }
1245 
1246   return false;
1247 }
1248 
1249 // This is a flat memory operation. Check to see if it has memory tokens for
1250 // either LDS or FLAT.
1251 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1252   assert(TII->isFLAT(MI));
1253 
1254   // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1255   if (!TII->usesLGKM_CNT(MI))
1256     return false;
1257 
1258   // If in tgsplit mode then there can be no use of LDS.
1259   if (ST->isTgSplitEnabled())
1260     return false;
1261 
1262   // If there are no memory operands then conservatively assume the flat
1263   // operation may access LDS.
1264   if (MI.memoperands_empty())
1265     return true;
1266 
1267   // See if any memory operand specifies an address space that involves LDS.
1268   for (const MachineMemOperand *Memop : MI.memoperands()) {
1269     unsigned AS = Memop->getAddrSpace();
1270     if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1271       return true;
1272   }
1273 
1274   return false;
1275 }
1276 
1277 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1278                                                WaitcntBrackets *ScoreBrackets) {
1279   // Now look at the instruction opcode. If it is a memory access
1280   // instruction, update the upper-bound of the appropriate counter's
1281   // bracket and the destination operand scores.
1282   // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1283   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1284     if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1285         TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1286       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1287       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1288     } else {
1289       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1290     }
1291   } else if (TII->isFLAT(Inst)) {
1292     assert(Inst.mayLoadOrStore());
1293 
1294     int FlatASCount = 0;
1295 
1296     if (mayAccessVMEMThroughFlat(Inst)) {
1297       ++FlatASCount;
1298       if (!ST->hasVscnt())
1299         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1300       else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
1301         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1302       else
1303         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1304     }
1305 
1306     if (mayAccessLDSThroughFlat(Inst)) {
1307       ++FlatASCount;
1308       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1309     }
1310 
1311     // A Flat memory operation must access at least one address space.
1312     assert(FlatASCount);
1313 
1314     // This is a flat memory operation that accesses both VMEM and LDS, so note
1315     // it - it will require that both the VM and LGKM counters be flushed to zero
1316     // if it is pending when a VM or LGKM dependency occurs.
1317     if (FlatASCount > 1)
1318       ScoreBrackets->setPendingFlat();
1319   } else if (SIInstrInfo::isVMEM(Inst) &&
1320              // TODO: get a better carve out.
1321              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1322              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1323              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
1324              Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
1325              Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
1326     if (!ST->hasVscnt())
1327       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1328     else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
1329              /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
1330              (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
1331       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1332     else if (Inst.mayStore())
1333       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1334 
1335     if (ST->vmemWriteNeedsExpWaitcnt() &&
1336         (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
1337       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1338     }
1339   } else if (TII->isSMRD(Inst)) {
1340     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1341   } else if (Inst.isCall()) {
1342     if (callWaitsOnFunctionReturn(Inst)) {
1343       // Act as a wait on everything
1344       ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1345     } else {
1346       // May need to wait for anything.
1347       ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1348     }
1349   } else if (SIInstrInfo::isEXP(Inst)) {
1350     unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1351     if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
1352       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1353     else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
1354       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1355     else
1356       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1357   } else {
1358     switch (Inst.getOpcode()) {
1359     case AMDGPU::S_SENDMSG:
1360     case AMDGPU::S_SENDMSGHALT:
1361       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1362       break;
1363     case AMDGPU::S_MEMTIME:
1364     case AMDGPU::S_MEMREALTIME:
1365       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1366       break;
1367     }
1368   }
1369 }
1370 
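// Rebase a score from its original bracket into the merged bracket: scores at
// or below the old lower bound are already satisfied and collapse to 0, while
// live scores are shifted so that both inputs' upper bounds align with the
// merged upper bound. Returns true if \p OtherScore strictly tightens the
// merged score.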
1371 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
1372                                  unsigned OtherScore) {
1373   unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1374   unsigned OtherShifted =
1375       OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1376   Score = std::max(MyShifted, OtherShifted);
1377   return OtherShifted > MyShifted;
1378 }
1379 
1380 /// Merge the pending events and associated score brackets of \p Other into
1381 /// this bracket's status.
1382 ///
1383 /// Returns whether the merge resulted in a change that requires tighter waits
1384 /// (i.e. the merged brackets strictly dominate the original brackets).
1385 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1386   bool StrictDom = false;
1387 
1388   VgprUB = std::max(VgprUB, Other.VgprUB);
1389   SgprUB = std::max(SgprUB, Other.SgprUB);
1390 
1391   for (auto T : inst_counter_types()) {
1392     // Merge event flags for this counter
1393     const bool OldOutOfOrder = counterOutOfOrder(T);
1394     const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
1395     const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1396     if (OtherEvents & ~OldEvents)
1397       StrictDom = true;
1398     PendingEvents |= OtherEvents;
1399 
1400     // Merge scores for this counter
1401     const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
1402     const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1403     const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
1404     if (NewUB < ScoreLBs[T])
1405       report_fatal_error("waitcnt score overflow");
1406 
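    // Rebase both sets of scores onto a common timeline whose upper bound is
    // NewUB: each shift maps the respective old upper bound to NewUB.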
    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    bool RegStrictDom = false;
    for (int J = 0; J <= VgprUB; J++) {
      RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
    }

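    // For vmcnt, also merge the per-VGPR VMEM type masks; a type bit that is
    // new to this block means stricter waits may be required.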
    if (T == VM_CNT) {
      for (int J = 0; J <= VgprUB; J++) {
        unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
        RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
        VgprVmemTypes[J] = NewVmemTypes;
      }
    }

    if (T == LGKM_CNT) {
      for (int J = 0; J <= SgprUB; J++) {
        RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
      }
    }

    if (RegStrictDom && !OldOutOfOrder)
      StrictDom = true;
  }

  return StrictDom;
}

// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block. There are two
  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
  // ST->partialVCCWritesUpdateVCCZ().
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
        (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         Inst.getOperand(0).isReg() &&
         Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
          Inst.definesRegister(AMDGPU::VCC_HI)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(AMDGPU::VCC)) {
        // There is a hardware bug on CI/SI where an SMRD instruction may
        // corrupt the vccz bit, so when we detect that an instruction may read
        // from a corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkmcnt(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.getScoreLB(LGKM_CNT) <
                ScoreBrackets.getScoreUB(LGKM_CNT) &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          // Writes to vcc while there's an outstanding smem read may get
          // clobbered as soon as any read completes.
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(Inst)) {
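      // Remember which address each scalar load reads from, keyed by its
      // block; the rest of the pass consults this map when deciding whether
      // later, potentially aliasing accesses need to wait on lgkmcnt.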
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        const Value *Ptr = Memop->getValue();
        SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates an S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0)
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score as if an S_WAITCNT vmcnt(0) had been emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
      ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (RestoreVCCZ) {
      // Restore the vccz bit.  Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  return Modified;
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
  HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;

  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

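  // Cache the hardware encodings of the first and last addressable VGPR and
  // SGPR; the register intervals tracked by the score brackets are expressed
  // relative to these base encodings.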
  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;

  TrackedWaitcntSet.clear();
  BlockInfos.clear();
  bool Modified = false;

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    MachineBasicBlock::iterator I = EntryBB.begin();
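    // Skip over PHIs and meta instructions so the wait is placed before the
    // first real instruction of the entry block.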
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;
    BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
    if (ST->hasVscnt())
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
          .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
          .addImm(0);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fixed point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.insert({MBB, BlockInfo(MBB)});

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(ST);
        else
          *Brackets = WaitcntBrackets(ST);
      }

      Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
      BI.Dirty = false;

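      // If the block still has pending events, propagate the brackets to each
      // successor. A successor is marked dirty (and the walk repeated if it
      // precedes the current block in the traversal) whenever its incoming
      // state changes. The brackets are moved into at most one successor to
      // avoid an extra copy.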
      if (Brackets->hasPending()) {
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : BI.MBB->successors()) {
          auto SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
           ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  return Modified;
}