1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/CodeGen/MachineFunction.h"
18 #include "llvm/CodeGen/ScheduleDAG.h"
19 #include "llvm/Support/TargetParser.h"
20 
21 using namespace llvm;
22 
23 namespace {
24 
25 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
26   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
27 
28   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
29     if (Arg.getAsInteger(0, Value))
30       return O.error("'" + Arg + "' value invalid for uint argument!");
31 
32     if (Value > 100)
33       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
34 
35     return false;
36   }
37 };
38 
39 } // end anonymous namespace
40 
41 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
42     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
43                      cl::desc("Fill a percentage of the latency between "
44                               "neighboring MFMA with s_nops."));
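// For example, -amdgpu-mfma-padding-ratio=50 ensures at least half of the
// previous MFMA's latency elapses before the next MFMA, padding with s_nop as
// needed; see checkMFMAPadding() below.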
45 
46 //===----------------------------------------------------------------------===//
47 // Hazard Recognizer Implementation
48 //===----------------------------------------------------------------------===//
49 
50 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
51                                                  const GCNSubtarget &ST);
52 
53 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
54   IsHazardRecognizerMode(false),
55   CurrCycleInstr(nullptr),
56   MF(MF),
57   ST(MF.getSubtarget<GCNSubtarget>()),
58   TII(*ST.getInstrInfo()),
59   TRI(TII.getRegisterInfo()),
60   ClauseUses(TRI.getNumRegUnits()),
61   ClauseDefs(TRI.getNumRegUnits()) {
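  // Use a deeper lookahead window when AGPRs are in use: MAI/MFMA hazards can
  // require far more wait states (e.g. 18 for 32x32 MFMA reads on gfx908)
  // than any other hazard handled here.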
62   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
63   TSchedModel.init(&ST);
64   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
65 }
66 
67 void GCNHazardRecognizer::Reset() {
68   EmittedInstrs.clear();
69 }
70 
71 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
72   EmitInstruction(SU->getInstr());
73 }
74 
75 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
76   CurrCycleInstr = MI;
77 }
78 
79 static bool isDivFMas(unsigned Opcode) {
80   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
81 }
82 
83 static bool isSGetReg(unsigned Opcode) {
84   return Opcode == AMDGPU::S_GETREG_B32;
85 }
86 
87 static bool isSSetReg(unsigned Opcode) {
88   switch (Opcode) {
89   case AMDGPU::S_SETREG_B32:
90   case AMDGPU::S_SETREG_B32_mode:
91   case AMDGPU::S_SETREG_IMM32_B32:
92   case AMDGPU::S_SETREG_IMM32_B32_mode:
93     return true;
94   }
95   return false;
96 }
97 
98 static bool isRWLane(unsigned Opcode) {
99   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
100 }
101 
102 static bool isRFE(unsigned Opcode) {
103   return Opcode == AMDGPU::S_RFE_B64;
104 }
105 
106 static bool isSMovRel(unsigned Opcode) {
107   switch (Opcode) {
108   case AMDGPU::S_MOVRELS_B32:
109   case AMDGPU::S_MOVRELS_B64:
110   case AMDGPU::S_MOVRELD_B32:
111   case AMDGPU::S_MOVRELD_B64:
112     return true;
113   default:
114     return false;
115   }
116 }
117 
118 static bool isDGEMM(unsigned Opcode) {
119   return AMDGPU::getMAIIsDGEMM(Opcode);
120 }
121 
122 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
123   unsigned Opcode = MI.getOpcode();
124 
125   if (!SIInstrInfo::isMAI(MI) ||
126       isDGEMM(Opcode) ||
127       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
128       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
129     return false;
130 
131   if (!ST.hasGFX940Insts())
132     return true;
133 
134   return AMDGPU::getMAIIsGFX940XDL(Opcode);
135 }
136 
137 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
138                                     const MachineInstr &MI) {
139   if (TII.isAlwaysGDS(MI.getOpcode()))
140     return true;
141 
142   switch (MI.getOpcode()) {
143   case AMDGPU::S_SENDMSG:
144   case AMDGPU::S_SENDMSGHALT:
145   case AMDGPU::S_TTRACEDATA:
146     return true;
147   // These DS opcodes don't support GDS.
148   case AMDGPU::DS_NOP:
149   case AMDGPU::DS_PERMUTE_B32:
150   case AMDGPU::DS_BPERMUTE_B32:
151     return false;
152   default:
153     if (TII.isDS(MI.getOpcode())) {
154       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
155                                            AMDGPU::OpName::gds);
156       if (MI.getOperand(GDS).getImm())
157         return true;
158     }
159     return false;
160   }
161 }
162 
163 static bool isPermlane(const MachineInstr &MI) {
164   unsigned Opcode = MI.getOpcode();
165   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
167 }
168 
169 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
170   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
171                                                      AMDGPU::OpName::simm16);
172   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
173 }
174 
175 ScheduleHazardRecognizer::HazardType
176 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
177   MachineInstr *MI = SU->getInstr();
178   // If we are not in "HazardRecognizerMode" we are being run from the
179   // scheduler; track possible stalls from hazards but don't insert noops.
180   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
181 
182   if (MI->isBundle())
183     return NoHazard;
184 
185   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
186     return HazardType;
187 
188   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
189     return HazardType;
190 
191   if (checkFPAtomicToDenormModeHazard(MI) > 0)
192     return HazardType;
193 
194   if (ST.hasNoDataDepHazard())
195     return NoHazard;
196 
197   // FIXME: Should flat be considered vmem?
198   if ((SIInstrInfo::isVMEM(*MI) ||
199        SIInstrInfo::isFLAT(*MI))
200       && checkVMEMHazards(MI) > 0)
201     return HazardType;
202 
203   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
204     return HazardType;
205 
206   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
207     return HazardType;
208 
209   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
210     return HazardType;
211 
212   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
213     return HazardType;
214 
215   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
216        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
217        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
218     return HazardType;
219 
220   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
221     return HazardType;
222 
223   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
224     return HazardType;
225 
226   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
227     return HazardType;
228 
229   if (ST.hasReadM0MovRelInterpHazard() &&
230       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
231       checkReadM0Hazards(MI) > 0)
232     return HazardType;
233 
234   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
235       checkReadM0Hazards(MI) > 0)
236     return HazardType;
237 
238   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
239     return HazardType;
240 
241   if ((SIInstrInfo::isVMEM(*MI) ||
242        SIInstrInfo::isFLAT(*MI) ||
243        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
244     return HazardType;
245 
246   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
247     return HazardType;
248 
249   return NoHazard;
250 }
251 
252 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
253                                 unsigned Quantity) {
254   while (Quantity > 0) {
255     unsigned Arg = std::min(Quantity, 8u);
256     Quantity -= Arg;
257     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
258         .addImm(Arg - 1);
259   }
260 }
261 
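// Returns the MFMA pipeline occupancy of \p MI in wait states, taken from the
// resource cycles of the first WriteProcRes entry of its scheduling class.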
262 unsigned
263 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
264   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
265   assert(TSchedModel.getWriteProcResBegin(SC) !=
266          TSchedModel.getWriteProcResEnd(SC));
267   return TSchedModel.getWriteProcResBegin(SC)->Cycles;
268 }
269 
270 void GCNHazardRecognizer::processBundle() {
271   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
272   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
273   // Check bundled MachineInstrs for hazards.
274   for (; MI != E && MI->isInsideBundle(); ++MI) {
275     CurrCycleInstr = &*MI;
276     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
277 
278     if (IsHazardRecognizerMode) {
279       fixHazards(CurrCycleInstr);
280 
281       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
282     }
283 
284     // It's unnecessary to track more than MaxLookAhead instructions. Since we
285     // include the bundled MI directly after, only add a maximum of
286     // (MaxLookAhead - 1) noops to EmittedInstrs.
287     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
288       EmittedInstrs.push_front(nullptr);
289 
290     EmittedInstrs.push_front(CurrCycleInstr);
291     EmittedInstrs.resize(MaxLookAhead);
292   }
293   CurrCycleInstr = nullptr;
294 }
295 
296 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
297   IsHazardRecognizerMode = true;
298   CurrCycleInstr = MI;
299   unsigned W = PreEmitNoopsCommon(MI);
300   fixHazards(MI);
301   CurrCycleInstr = nullptr;
302   return W;
303 }
304 
305 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
306   if (MI->isBundle())
307     return 0;
308 
309   int WaitStates = 0;
310 
311   if (SIInstrInfo::isSMRD(*MI))
312     return std::max(WaitStates, checkSMRDHazards(MI));
313 
314   if (ST.hasNSAtoVMEMBug())
315     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
316 
317   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
318 
319   if (ST.hasNoDataDepHazard())
320     return WaitStates;
321 
322   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
323     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
324 
325   if (SIInstrInfo::isVALU(*MI))
326     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
327 
328   if (SIInstrInfo::isDPP(*MI))
329     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
330 
331   if (isDivFMas(MI->getOpcode()))
332     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
333 
334   if (isRWLane(MI->getOpcode()))
335     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
336 
337   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
338        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
339        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
340     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
341 
342   if (MI->isInlineAsm())
343     return std::max(WaitStates, checkInlineAsmHazards(MI));
344 
345   if (isSGetReg(MI->getOpcode()))
346     return std::max(WaitStates, checkGetRegHazards(MI));
347 
348   if (isSSetReg(MI->getOpcode()))
349     return std::max(WaitStates, checkSetRegHazards(MI));
350 
351   if (isRFE(MI->getOpcode()))
352     return std::max(WaitStates, checkRFEHazards(MI));
353 
354   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
355                                            isSMovRel(MI->getOpcode())))
356     return std::max(WaitStates, checkReadM0Hazards(MI));
357 
358   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
359     return std::max(WaitStates, checkReadM0Hazards(MI));
360 
361   if (SIInstrInfo::isMAI(*MI))
362     return std::max(WaitStates, checkMAIHazards(MI));
363 
364   if (SIInstrInfo::isVMEM(*MI) ||
365       SIInstrInfo::isFLAT(*MI) ||
366       SIInstrInfo::isDS(*MI))
367     return std::max(WaitStates, checkMAILdStHazards(MI));
368 
369   return WaitStates;
370 }
371 
372 void GCNHazardRecognizer::EmitNoop() {
373   EmittedInstrs.push_front(nullptr);
374 }
375 
376 void GCNHazardRecognizer::AdvanceCycle() {
377   // When the scheduler detects a stall, it will call AdvanceCycle() without
378   // emitting any instructions.
379   if (!CurrCycleInstr) {
380     EmittedInstrs.push_front(nullptr);
381     return;
382   }
383 
384   if (CurrCycleInstr->isBundle()) {
385     processBundle();
386     return;
387   }
388 
389   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
390   if (!NumWaitStates) {
391     CurrCycleInstr = nullptr;
392     return;
393   }
394 
395   // Keep track of emitted instructions
396   EmittedInstrs.push_front(CurrCycleInstr);
397 
398   // Add a nullptr for each additional wait state after the first.  Make sure
399   // not to add more than getMaxLookAhead() items to the list, since we
400   // truncate the list to that size right after this loop.
401   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
402        i < e; ++i) {
403     EmittedInstrs.push_front(nullptr);
404   }
405 
406   // getMaxLookAhead() is the largest number of wait states we will ever need
407   // to insert, so there is no point in keeping track of more than that many
408   // wait states.
409   EmittedInstrs.resize(getMaxLookAhead());
410 
411   CurrCycleInstr = nullptr;
412 }
413 
414 void GCNHazardRecognizer::RecedeCycle() {
415   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
416 }
417 
418 //===----------------------------------------------------------------------===//
419 // Helper Functions
420 //===----------------------------------------------------------------------===//
421 
422 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
423 
424 // Returns the minimum number of wait states since \p I, walking all
425 // predecessors. Scanning stops once \p IsExpired returns true.
426 // Can only be run in hazard recognizer mode.
427 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
428                               const MachineBasicBlock *MBB,
429                               MachineBasicBlock::const_reverse_instr_iterator I,
430                               int WaitStates, IsExpiredFn IsExpired,
431                               DenseSet<const MachineBasicBlock *> &Visited) {
432   for (auto E = MBB->instr_rend(); I != E; ++I) {
433     // Don't add WaitStates for parent BUNDLE instructions.
434     if (I->isBundle())
435       continue;
436 
437     if (IsHazard(*I))
438       return WaitStates;
439 
440     if (I->isInlineAsm())
441       continue;
442 
443     WaitStates += SIInstrInfo::getNumWaitStates(*I);
444 
445     if (IsExpired(*I, WaitStates))
446       return std::numeric_limits<int>::max();
447   }
448 
449   int MinWaitStates = std::numeric_limits<int>::max();
450   for (MachineBasicBlock *Pred : MBB->predecessors()) {
451     if (!Visited.insert(Pred).second)
452       continue;
453 
454     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
455                                WaitStates, IsExpired, Visited);
456 
457     MinWaitStates = std::min(MinWaitStates, W);
458   }
459 
460   return MinWaitStates;
461 }
462 
463 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
464                               const MachineInstr *MI, IsExpiredFn IsExpired) {
465   DenseSet<const MachineBasicBlock *> Visited;
466   return getWaitStatesSince(IsHazard, MI->getParent(),
467                             std::next(MI->getReverseIterator()),
468                             0, IsExpired, Visited);
469 }
470 
471 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
472   if (IsHazardRecognizerMode) {
473     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
474       return WaitStates >= Limit;
475     };
476     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
477   }
478 
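  // Outside hazard recognizer mode, walk the EmittedInstrs queue instead; each
  // entry counts as one wait state (nullptr entries are stall cycles or extra
  // wait states recorded by AdvanceCycle).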
479   int WaitStates = 0;
480   for (MachineInstr *MI : EmittedInstrs) {
481     if (MI) {
482       if (IsHazard(*MI))
483         return WaitStates;
484 
485       if (MI->isInlineAsm())
486         continue;
487     }
488     ++WaitStates;
489 
490     if (WaitStates >= Limit)
491       break;
492   }
493   return std::numeric_limits<int>::max();
494 }
495 
496 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
497                                                IsHazardFn IsHazardDef,
498                                                int Limit) {
499   const SIRegisterInfo *TRI = ST.getRegisterInfo();
500 
501   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
502     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
503   };
504 
505   return getWaitStatesSince(IsHazardFn, Limit);
506 }
507 
508 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
509                                                   int Limit) {
510   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
511     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
512   };
513 
514   return getWaitStatesSince(IsHazardFn, Limit);
515 }
516 
517 //===----------------------------------------------------------------------===//
518 // No-op Hazard Detection
519 //===----------------------------------------------------------------------===//
520 
521 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
522                         MCRegister Reg) {
523   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
524     BV.set(*RUI);
525 }
526 
527 static void addRegsToSet(const SIRegisterInfo &TRI,
528                          iterator_range<MachineInstr::const_mop_iterator> Ops,
529                          BitVector &Set) {
530   for (const MachineOperand &Op : Ops) {
531     if (Op.isReg())
532       addRegUnits(TRI, Set, Op.getReg().asMCReg());
533   }
534 }
535 
536 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
537   // XXX: Do we need to worry about implicit operands?
538   addRegsToSet(TRI, MI.defs(), ClauseDefs);
539   addRegsToSet(TRI, MI.uses(), ClauseUses);
540 }
541 
542 static bool breaksSMEMSoftClause(MachineInstr *MI) {
543   return !SIInstrInfo::isSMRD(*MI);
544 }
545 
546 static bool breaksVMEMSoftClause(MachineInstr *MI) {
547   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
548 }
549 
550 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
551   // SMEM soft clauses are only present on VI+, and only matter if xnack is
552   // enabled.
553   if (!ST.isXNACKEnabled())
554     return 0;
555 
556   bool IsSMRD = TII.isSMRD(*MEM);
557 
558   resetClause();
559 
560   // A soft-clause is any group of consecutive SMEM instructions.  The
561   // instructions in this group may return out of order and/or may be
562   // replayed (i.e. the same instruction issued more than once).
563   //
564   // In order to handle these situations correctly we need to make sure that
565   // when a clause has more than one instruction, no instruction in the clause
566   // writes to a register that is read by another instruction in the clause
567   // (including itself). If we encounter this situation, we need to break the
568   // clause by inserting a non-SMEM instruction.
569 
570   for (MachineInstr *MI : EmittedInstrs) {
571     // When we hit a non-SMEM instruction then we have passed the start of the
572     // clause and we can stop.
573     if (!MI)
574       break;
575 
576     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
577       break;
578 
579     addClauseInst(*MI);
580   }
581 
582   if (ClauseDefs.none())
583     return 0;
584 
585   // We need to make sure not to put loads and stores in the same clause if they
586   // use the same address. For now, just start a new clause whenever we see a
587   // store.
588   if (MEM->mayStore())
589     return 1;
590 
591   addClauseInst(*MEM);
592 
593   // If the set of defs and uses intersect then we cannot add this instruction
594   // to the clause, so we have a hazard.
595   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
596 }
597 
598 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
599   int WaitStatesNeeded = 0;
600 
601   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
602 
603   // This SMRD hazard only affects SI.
604   if (!ST.hasSMRDReadVALUDefHazard())
605     return WaitStatesNeeded;
606 
607   // A read of an SGPR by an SMRD instruction requires 4 wait states when the
608   // SGPR was written by a VALU instruction.
609   int SmrdSgprWaitStates = 4;
610   auto IsHazardDefFn = [this](const MachineInstr &MI) {
611     return TII.isVALU(MI);
612   };
613   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
614     return TII.isSALU(MI);
615   };
616 
617   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
618 
619   for (const MachineOperand &Use : SMRD->uses()) {
620     if (!Use.isReg())
621       continue;
622     int WaitStatesNeededForUse =
623         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
624                                                    SmrdSgprWaitStates);
625     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
626 
627     // This fixes what appears to be undocumented hardware behavior in SI where
628     // an s_mov writing a descriptor and an s_buffer_load_dword reading that
629     // descriptor need some number of nops in between. We don't know how many
630     // are needed, so use 4. This probably wasn't discovered before because the
631     // only case where it happens is when we expand a 64-bit pointer into a full
632     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
633     // probably never encountered in closed-source land.
634     if (IsBufferSMRD) {
635       int WaitStatesNeededForUse =
636         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
637                                                    IsBufferHazardDefFn,
638                                                    SmrdSgprWaitStates);
639       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
640     }
641   }
642 
643   return WaitStatesNeeded;
644 }
645 
646 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
647   if (!ST.hasVMEMReadSGPRVALUDefHazard())
648     return 0;
649 
650   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
651 
652   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
653   // SGPR was written by a VALU instruction.
654   const int VmemSgprWaitStates = 5;
655   auto IsHazardDefFn = [this](const MachineInstr &MI) {
656     return TII.isVALU(MI);
657   };
658   for (const MachineOperand &Use : VMEM->uses()) {
659     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
660       continue;
661 
662     int WaitStatesNeededForUse =
663         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
664                                                    VmemSgprWaitStates);
665     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
666   }
667   return WaitStatesNeeded;
668 }
669 
670 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
671   const SIRegisterInfo *TRI = ST.getRegisterInfo();
672   const SIInstrInfo *TII = ST.getInstrInfo();
673 
674   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
675   int DppVgprWaitStates = 2;
676   int DppExecWaitStates = 5;
677   int WaitStatesNeeded = 0;
678   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
679     return TII->isVALU(MI);
680   };
681 
682   for (const MachineOperand &Use : DPP->uses()) {
683     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
684       continue;
685     int WaitStatesNeededForUse =
686         DppVgprWaitStates - getWaitStatesSinceDef(
687                                 Use.getReg(),
688                                 [](const MachineInstr &) { return true; },
689                                 DppVgprWaitStates);
690     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
691   }
692 
693   WaitStatesNeeded = std::max(
694       WaitStatesNeeded,
695       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
696                                                 DppExecWaitStates));
697 
698   return WaitStatesNeeded;
699 }
700 
701 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
702   const SIInstrInfo *TII = ST.getInstrInfo();
703 
704   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
705   // instruction.
706   const int DivFMasWaitStates = 4;
707   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
708     return TII->isVALU(MI);
709   };
710   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
711                                                DivFMasWaitStates);
712 
713   return DivFMasWaitStates - WaitStatesNeeded;
714 }
715 
716 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
717   const SIInstrInfo *TII = ST.getInstrInfo();
718   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
719 
720   const int GetRegWaitStates = 2;
721   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
722     return GetRegHWReg == getHWReg(TII, MI);
723   };
724   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
725 
726   return GetRegWaitStates - WaitStatesNeeded;
727 }
728 
729 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
730   const SIInstrInfo *TII = ST.getInstrInfo();
731   unsigned HWReg = getHWReg(TII, *SetRegInstr);
732 
733   const int SetRegWaitStates = ST.getSetRegWaitStates();
734   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
735     return HWReg == getHWReg(TII, MI);
736   };
737   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
738   return SetRegWaitStates - WaitStatesNeeded;
739 }
740 
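// Returns the operand index of \p MI's store data if \p MI is a store that
// must not have its data register overwritten by a VALU too soon afterwards
// (see has12DWordStoreHazard and checkVALUHazardsHelper), or -1 otherwise.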
741 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
742   if (!MI.mayStore())
743     return -1;
744 
745   const SIInstrInfo *TII = ST.getInstrInfo();
746   unsigned Opcode = MI.getOpcode();
747   const MCInstrDesc &Desc = MI.getDesc();
748 
749   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
750   int VDataRCID = -1;
751   if (VDataIdx != -1)
752     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
753 
754   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
755     // There is no hazard if the instruction does not use vector regs
756     // (like wbinvl1)
757     if (VDataIdx == -1)
758       return -1;
759     // For MUBUF/MTBUF instructions this hazard only exists if the
760     // instruction is not using a register in the soffset field.
761     const MachineOperand *SOffset =
762         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
763     // If we have no soffset operand, then assume this field has been
764     // hardcoded to zero.
765     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
766         (!SOffset || !SOffset->isReg()))
767       return VDataIdx;
768   }
769 
770   // MIMG instructions create a hazard if they don't use a 256-bit T# and
771   // the store size is greater than 8 bytes and they have more than two bits
772   // of their dmask set.
773   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
774   if (TII->isMIMG(MI)) {
775     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
776     assert(SRsrcIdx != -1 &&
777            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
778     (void)SRsrcIdx;
779   }
780 
781   if (TII->isFLAT(MI)) {
782     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
783     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
784       return DataIdx;
785   }
786 
787   return -1;
788 }
789 
790 int
791 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
792                                             const MachineRegisterInfo &MRI) {
793   // Helper to check for the hazard where VMEM instructions that store more than
794   // 8 bytes can have their store data overwritten by the next instruction.
795   const SIRegisterInfo *TRI = ST.getRegisterInfo();
796 
797   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
798   int WaitStatesNeeded = 0;
799 
800   if (!TRI->isVectorRegister(MRI, Def.getReg()))
801     return WaitStatesNeeded;
802   Register Reg = Def.getReg();
803   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
804     int DataIdx = createsVALUHazard(MI);
805     return DataIdx >= 0 &&
806            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
807   };
808   int WaitStatesNeededForDef =
809     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
810   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
811 
812   return WaitStatesNeeded;
813 }
814 
815 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
816   int WaitStatesNeeded = 0;
817 
818   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
819     const int TransDefWaitstates = 1;
820 
821     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
822       if (!SIInstrInfo::isTRANS(MI))
823         return false;
824       const SIRegisterInfo *TRI = ST.getRegisterInfo();
825       const SIInstrInfo *TII = ST.getInstrInfo();
826       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
827 
828       for (const MachineOperand &Use : VALU->explicit_uses()) {
829         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
830           return true;
831       }
832 
833       return false;
834     };
835 
836     int WaitStatesNeededForDef =
837         TransDefWaitstates -
838         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
839     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
840   }
841 
842   if (ST.hasDstSelForwardingHazard()) {
843     const int Shift16DefWaitstates = 1;
844 
845     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
846       if (!SIInstrInfo::isVALU(MI))
847         return false;
848       const SIInstrInfo *TII = ST.getInstrInfo();
849       if (SIInstrInfo::isSDWA(MI)) {
850         if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
851           if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
852             return false;
853       } else {
854         if ((AMDGPU::getNamedOperandIdx(MI.getOpcode(),
855                                         AMDGPU::OpName::op_sel) == -1) ||
856             !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
857                   ->getImm() &
858               SISrcMods::DST_OP_SEL))
859           return false;
860       }
861       const SIRegisterInfo *TRI = ST.getRegisterInfo();
862       if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
863         Register Def = Dst->getReg();
864 
865         for (const MachineOperand &Use : VALU->explicit_uses()) {
866           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
867             return true;
868         }
869       }
870 
871       return false;
872     };
873 
874     int WaitStatesNeededForDef =
875         Shift16DefWaitstates -
876         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
877     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
878   }
879 
880   if (ST.hasVDecCoExecHazard()) {
881     const int VALUWriteSGPRVALUReadWaitstates = 2;
882     const int VALUWriteEXECRWLane = 4;
883     const int VALUWriteVGPRReadlaneRead = 1;
884 
885     const SIRegisterInfo *TRI = ST.getRegisterInfo();
886     const MachineRegisterInfo &MRI = MF.getRegInfo();
887     Register UseReg;
888     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
889       if (!SIInstrInfo::isVALU(MI))
890         return false;
891       return MI.modifiesRegister(UseReg, TRI);
892     };
893 
894     for (const MachineOperand &Use : VALU->explicit_uses()) {
895       if (!Use.isReg())
896         continue;
897 
898       UseReg = Use.getReg();
899       if (TRI->isSGPRReg(MRI, UseReg)) {
900         int WaitStatesNeededForDef =
901             VALUWriteSGPRVALUReadWaitstates -
902             getWaitStatesSince(IsVALUDefSGPRFn,
903                                VALUWriteSGPRVALUReadWaitstates);
904         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
905       }
906     }
907 
908     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
909       UseReg = AMDGPU::VCC;
910       int WaitStatesNeededForDef =
911           VALUWriteSGPRVALUReadWaitstates -
912           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
913       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
914     }
915 
916     switch (VALU->getOpcode()) {
917     case AMDGPU::V_READLANE_B32:
918     case AMDGPU::V_READFIRSTLANE_B32: {
919       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
920       UseReg = Src->getReg();
921       int WaitStatesNeededForDef =
922           VALUWriteVGPRReadlaneRead -
923           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
924       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
925     }
926       LLVM_FALLTHROUGH;
927     case AMDGPU::V_WRITELANE_B32: {
928       UseReg = AMDGPU::EXEC;
929       int WaitStatesNeededForDef =
930           VALUWriteEXECRWLane -
931           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
932       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
933       break;
934     }
935     default:
936       break;
937     }
938   }
939 
940   // This checks for the hazard where VMEM instructions that store more than
941   // 8 bytes can have their store data overwritten by the next instruction.
942   if (!ST.has12DWordStoreHazard())
943     return WaitStatesNeeded;
944 
945   const MachineRegisterInfo &MRI = MF.getRegInfo();
946 
947   for (const MachineOperand &Def : VALU->defs()) {
948     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
949   }
950 
951   return WaitStatesNeeded;
952 }
953 
954 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
955   // This checks for hazards associated with inline asm statements.
956   // Since inline asms can contain just about anything, we use this
957   // to call/leverage other check*Hazard routines. Note that
958   // this function doesn't attempt to address all possible inline asm
959   // hazards (good luck), but is a collection of what has been
960   // problematic thus far.
961 
962   // see checkVALUHazards()
963   if (!ST.has12DWordStoreHazard())
964     return 0;
965 
966   const MachineRegisterInfo &MRI = MF.getRegInfo();
967   int WaitStatesNeeded = 0;
968 
969   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
970        I != E; ++I) {
971     const MachineOperand &Op = IA->getOperand(I);
972     if (Op.isReg() && Op.isDef()) {
973       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
974     }
975   }
976 
977   return WaitStatesNeeded;
978 }
979 
980 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
981   const SIInstrInfo *TII = ST.getInstrInfo();
982   const SIRegisterInfo *TRI = ST.getRegisterInfo();
983   const MachineRegisterInfo &MRI = MF.getRegInfo();
984 
985   const MachineOperand *LaneSelectOp =
986       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
987 
988   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
989     return 0;
990 
991   Register LaneSelectReg = LaneSelectOp->getReg();
992   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
993 
994   const int RWLaneWaitStates = 4;
995   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
996                                               RWLaneWaitStates);
997   return RWLaneWaitStates - WaitStatesSince;
998 }
999 
1000 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1001   if (!ST.hasRFEHazards())
1002     return 0;
1003 
1004   const SIInstrInfo *TII = ST.getInstrInfo();
1005 
1006   const int RFEWaitStates = 1;
1007 
1008   auto IsHazardFn = [TII](const MachineInstr &MI) {
1009     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1010   };
1011   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1012   return RFEWaitStates - WaitStatesNeeded;
1013 }
1014 
1015 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1016   const SIInstrInfo *TII = ST.getInstrInfo();
1017   const int SMovRelWaitStates = 1;
1018   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1019   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
1020                                                    SMovRelWaitStates);
1021 }
1022 
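// Run the code-rewriting workarounds for hazards that are resolved by
// inserting or mutating instructions rather than by plain wait states. Each
// fix*() routine returns true if it changed the code.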
1023 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1024   fixVMEMtoScalarWriteHazards(MI);
1025   fixVcmpxPermlaneHazards(MI);
1026   fixSMEMtoVectorWriteHazards(MI);
1027   fixVcmpxExecWARHazard(MI);
1028   fixLdsBranchVmemWARHazard(MI);
1029 }
1030 
1031 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1032   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1033     return false;
1034 
1035   const SIInstrInfo *TII = ST.getInstrInfo();
1036   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };
1037 
1038   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1039     unsigned Opc = MI.getOpcode();
1040     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1041            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1042   };
1043 
1044   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1045       std::numeric_limits<int>::max())
1046     return false;
1047 
1048   // V_NOP will be discarded by SQ.
1049   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1050   // which is always a VGPR and available.
1051   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1052   Register Reg = Src0->getReg();
1053   bool IsUndef = Src0->isUndef();
1054   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1055           TII->get(AMDGPU::V_MOV_B32_e32))
1056     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1057     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1058 
1059   return true;
1060 }
1061 
1062 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1063   if (!ST.hasVMEMtoScalarWriteHazard())
1064     return false;
1065 
1066   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1067     return false;
1068 
1069   if (MI->getNumDefs() == 0)
1070     return false;
1071 
1072   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1073 
1074   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1075     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1076         !SIInstrInfo::isFLAT(I))
1077       return false;
1078 
1079     for (const MachineOperand &Def : MI->defs()) {
1080       const MachineOperand *Op =
1081           I.findRegisterUseOperand(Def.getReg(), false, TRI);
1082       if (!Op)
1083         continue;
1084       return true;
1085     }
1086     return false;
1087   };
1088 
1089   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1090     return SIInstrInfo::isVALU(MI) ||
1091            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1092             !MI.getOperand(0).getImm()) ||
1093            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1094             MI.getOperand(0).getImm() == 0xffe3);
1095   };
1096 
1097   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1098       std::numeric_limits<int>::max())
1099     return false;
1100 
1101   const SIInstrInfo *TII = ST.getInstrInfo();
1102   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1103           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1104       .addImm(0xffe3);
1105   return true;
1106 }
1107 
1108 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1109   if (!ST.hasSMEMtoVectorWriteHazard())
1110     return false;
1111 
1112   if (!SIInstrInfo::isVALU(*MI))
1113     return false;
1114 
1115   unsigned SDSTName;
1116   switch (MI->getOpcode()) {
1117   case AMDGPU::V_READLANE_B32:
1118   case AMDGPU::V_READFIRSTLANE_B32:
1119     SDSTName = AMDGPU::OpName::vdst;
1120     break;
1121   default:
1122     SDSTName = AMDGPU::OpName::sdst;
1123     break;
1124   }
1125 
1126   const SIInstrInfo *TII = ST.getInstrInfo();
1127   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1128   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1129   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1130   if (!SDST) {
1131     for (const auto &MO : MI->implicit_operands()) {
1132       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
1133         SDST = &MO;
1134         break;
1135       }
1136     }
1137   }
1138 
1139   if (!SDST)
1140     return false;
1141 
1142   const Register SDSTReg = SDST->getReg();
1143   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1144     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1145   };
1146 
1147   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1148     if (TII->isSALU(MI)) {
1149       switch (MI.getOpcode()) {
1150       case AMDGPU::S_SETVSKIP:
1151       case AMDGPU::S_VERSION:
1152       case AMDGPU::S_WAITCNT_VSCNT:
1153       case AMDGPU::S_WAITCNT_VMCNT:
1154       case AMDGPU::S_WAITCNT_EXPCNT:
1155         // These instructions cannot mitigate the hazard.
1156         return false;
1157       case AMDGPU::S_WAITCNT_LGKMCNT:
1158         // Reducing lgkmcnt to 0 always mitigates the hazard.
1159         return (MI.getOperand(1).getImm() == 0) &&
1160                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1161       case AMDGPU::S_WAITCNT: {
1162         const int64_t Imm = MI.getOperand(0).getImm();
1163         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1164         return (Decoded.LgkmCnt == 0);
1165       }
1166       default:
1167         // SOPP instructions cannot mitigate the hazard.
1168         if (TII->isSOPP(MI))
1169           return false;
1170         // At this point the SALU can be assumed to mitigate the hazard
1171         // because either:
1172         // (a) it is independent of the at risk SMEM (breaking chain),
1173         // or
1174         // (b) it is dependent on the SMEM, in which case an appropriate
1175         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1176         //     SMEM instruction.
1177         return true;
1178       }
1179     }
1180     return false;
1181   };
1182 
1183   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1184       std::numeric_limits<int>::max())
1185     return false;
1186 
1187   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1188           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1189       .addImm(0);
1190   return true;
1191 }
1192 
1193 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1194   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1195     return false;
1196 
1197   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1198   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1199     return false;
1200 
1201   auto IsHazardFn = [TRI](const MachineInstr &I) {
1202     if (SIInstrInfo::isVALU(I))
1203       return false;
1204     return I.readsRegister(AMDGPU::EXEC, TRI);
1205   };
1206 
1207   const SIInstrInfo *TII = ST.getInstrInfo();
1208   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1209     if (SIInstrInfo::isVALU(MI)) {
1210       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1211         return true;
1212       for (auto MO : MI.implicit_operands())
1213         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1214           return true;
1215     }
1216     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1217         (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
1218       return true;
1219     return false;
1220   };
1221 
1222   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1223       std::numeric_limits<int>::max())
1224     return false;
1225 
1226   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1227           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1228     .addImm(0xfffe);
1229   return true;
1230 }
1231 
1232 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1233                                                  const GCNSubtarget &ST) {
1234   if (!ST.hasLdsBranchVmemWARHazard())
1235     return false;
1236 
1237   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1238   // instructions need to appear in the same function.
1239   bool HasLds = false;
1240   bool HasVmem = false;
1241   for (auto &MBB : MF) {
1242     for (auto &MI : MBB) {
1243       HasLds |= SIInstrInfo::isDS(MI);
1244       HasVmem |=
1245           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1246       if (HasLds && HasVmem)
1247         return true;
1248     }
1249   }
1250   return false;
1251 }
1252 
1253 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1254   if (!RunLdsBranchVmemWARHazardFixup)
1255     return false;
1256 
1257   assert(ST.hasLdsBranchVmemWARHazard());
1258 
1259   auto IsHazardInst = [](const MachineInstr &MI) {
1260     if (SIInstrInfo::isDS(MI))
1261       return 1;
1262     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1263       return 2;
1264     return 0;
1265   };
1266 
1267   auto InstType = IsHazardInst(*MI);
1268   if (!InstType)
1269     return false;
1270 
1271   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1272     return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1273                                I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1274                                !I.getOperand(1).getImm());
1275   };
1276 
1277   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1278     if (!I.isBranch())
1279       return false;
1280 
1281     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1282       auto InstType2 = IsHazardInst(I);
1283       return InstType2 && InstType != InstType2;
1284     };
1285 
1286     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1287       auto InstType2 = IsHazardInst(I);
1288       if (InstType == InstType2)
1289         return true;
1290 
1291       return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1292              I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1293              !I.getOperand(1).getImm();
1294     };
1295 
1296     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1297            std::numeric_limits<int>::max();
1298   };
1299 
1300   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1301       std::numeric_limits<int>::max())
1302     return false;
1303 
1304   const SIInstrInfo *TII = ST.getInstrInfo();
1305   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1306           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1307     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1308     .addImm(0);
1309 
1310   return true;
1311 }
1312 
1313 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1314   int NSAtoVMEMWaitStates = 1;
1315 
1316   if (!ST.hasNSAtoVMEMBug())
1317     return 0;
1318 
1319   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1320     return 0;
1321 
1322   const SIInstrInfo *TII = ST.getInstrInfo();
1323   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1324   if (!Offset || (Offset->getImm() & 6) == 0)
1325     return 0;
1326 
1327   auto IsHazardFn = [TII](const MachineInstr &I) {
1328     if (!SIInstrInfo::isMIMG(I))
1329       return false;
1330     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1331     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1332            TII->getInstSizeInBytes(I) >= 16;
1333   };
1334 
1335   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1336 }
1337 
1338 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1339   int FPAtomicToDenormModeWaitStates = 3;
1340 
1341   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1342     return 0;
1343 
1344   auto IsHazardFn = [](const MachineInstr &I) {
1345     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1346       return false;
1347     return SIInstrInfo::isFPAtomic(I);
1348   };
1349 
1350   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1351     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1352       return true;
1353 
1354     switch (MI.getOpcode()) {
1355     case AMDGPU::S_WAITCNT:
1356     case AMDGPU::S_WAITCNT_VSCNT:
1357     case AMDGPU::S_WAITCNT_VMCNT:
1358     case AMDGPU::S_WAITCNT_EXPCNT:
1359     case AMDGPU::S_WAITCNT_LGKMCNT:
1360     case AMDGPU::S_WAIT_IDLE:
1361       return true;
1362     default:
1363       break;
1364     }
1365 
1366     return false;
1367   };
1368 
1369   return FPAtomicToDenormModeWaitStates -
1370          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1371 }
1372 
1373 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1374   assert(SIInstrInfo::isMAI(*MI));
1375 
1376   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1377 }
1378 
1379 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1380   // Early exit if no padding is requested.
1381   if (MFMAPaddingRatio == 0)
1382     return 0;
1383 
1384   auto IsMFMAFn = [](const MachineInstr &MI) {
1385     return SIInstrInfo::isMAI(MI) &&
1386            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1387            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1388   };
1389 
1390   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1391   if (!IsMFMAFn(*MI) || MFI->getOccupancy() < 2)
1392     return 0;
1393 
1394   int NeighborMFMALatency = 0;
1395   auto IsNeighboringMFMA = [&IsMFMAFn, &NeighborMFMALatency,
1396                             this](const MachineInstr &MI) {
1397     if (!IsMFMAFn(MI))
1398       return false;
1399 
1400     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1401     return true;
1402   };
1403 
1404   const int MaxMFMAPipelineWaitStates = 16;
1405   int WaitStatesSinceNeighborMFMA =
1406       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1407 
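  // Pad up to the requested fraction of the neighboring MFMA's latency, less
  // the wait states that have already elapsed since it issued.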
1408   int NeighborMFMAPaddingNeeded =
1409       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1410       WaitStatesSinceNeighborMFMA;
1411 
1412   return std::max(0, NeighborMFMAPaddingNeeded);
1413 }
1414 
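// gfx908 MAI hazards. The wait-state tables below are keyed on the producing
// MFMA's latency: 2, 8 and 16 cycles correspond to the 4x4, 16x16 and 32x32
// variants respectively.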
1415 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1416   int WaitStatesNeeded = 0;
1417   unsigned Opc = MI->getOpcode();
1418 
1419   auto IsVALUFn = [](const MachineInstr &MI) {
1420     return SIInstrInfo::isVALU(MI);
1421   };
1422 
1423   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1424     const int LegacyVALUWritesVGPRWaitStates = 2;
1425     const int VALUWritesExecWaitStates = 4;
1426     const int MaxWaitStates = 4;
1427 
1428     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1429       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1430     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1431 
1432     if (WaitStatesNeeded < MaxWaitStates) {
1433       for (const MachineOperand &Use : MI->explicit_uses()) {
1434         const int MaxWaitStates = 2;
1435 
1436         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1437           continue;
1438 
1439         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1440           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1441         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1442 
1443         if (WaitStatesNeeded == MaxWaitStates)
1444           break;
1445       }
1446     }
1447   }
1448 
1449   auto IsMFMAFn = [](const MachineInstr &MI) {
1450     return SIInstrInfo::isMAI(MI) &&
1451            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1452            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1453   };
1454 
1455   for (const MachineOperand &Op : MI->explicit_operands()) {
1456     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1457       continue;
1458 
1459     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1460       continue;
1461 
1462     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1463     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1464     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1465     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1466     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1467     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1468     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1469     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1470     const int MaxWaitStates = 18;
1471     Register Reg = Op.getReg();
1472     unsigned HazardDefLatency = 0;
1473 
1474     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
1475                                this](const MachineInstr &MI) {
1476       if (!IsMFMAFn(MI))
1477         return false;
1478       Register DstReg = MI.getOperand(0).getReg();
1479       if (DstReg == Reg)
1480         return false;
1481       HazardDefLatency =
1482           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1483       return TRI.regsOverlap(DstReg, Reg);
1484     };
1485 
1486     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1487                                                    MaxWaitStates);
1488     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1489     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1490     int OpNo = MI->getOperandNo(&Op);
1491     if (OpNo == SrcCIdx) {
1492       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1493     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
1494       switch (HazardDefLatency) {
1495       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1496                break;
1497       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1498                break;
1499       case 16: LLVM_FALLTHROUGH;
1500       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1501                break;
1502       }
1503     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1504       switch (HazardDefLatency) {
1505       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1506                break;
1507       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1508                break;
1509       case 16: LLVM_FALLTHROUGH;
1510       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1511                break;
1512       }
1513     }
1514 
1515     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1516     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1517 
1518     if (WaitStatesNeeded == MaxWaitStates)
1519       return WaitStatesNeeded; // Early exit.
1520 
    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

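  // For v_accvgpr_write, also check for an earlier MFMA that reads the
  // destination AGPR as SrcC.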
  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Pad neighboring MFMAs with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsMFMAFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
  };

  auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
  };

  if (!IsMFMAFn(*MI))
    return WaitStatesNeeded;

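  // A non-MFMA VALU writing EXEC shortly before this MFMA is a hazard.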
  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop over the explicit uses; the cases below cover both DGEMM and
  // S/HGEMM as the second instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
    const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
    const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
    const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
    const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
    const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
    const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
    const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
    const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
    const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
    const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
    const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
    const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    Register Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

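    // Find the most recent MFMA writing a register that overlaps this use,
    // noting whether it wrote exactly this register (FullReg) and which
    // instruction it was (MI1).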
    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = MI->getOperandNo(&Use);
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
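    // The required wait states depend on whether the use is SrcC or SrcA/SrcB
    // and on the kind and latency (pass count) of the producing MFMA.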
    if (OpNo == SrcCIdx) {
      if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
        NeedWaitStates = 0;
      } else if (FullReg) {
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
        else if (ST.hasGFX940Insts() &&
                 TSchedModel.computeInstrLatency(MI1) == 2)
          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default:
          if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1))
            break;
          switch (TSchedModel.computeInstrLatency(MI1)) {
          case 2:
            NeedWaitStates = ST.hasGFX940Insts()
              ? isXDL(ST, *MI1)
                ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
                : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
              : isDGEMM(Opc)
                ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 4:
            assert(ST.hasGFX940Insts());
            NeedWaitStates = isXDL(ST, *MI1)
              ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
              : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates = ST.hasGFX940Insts()
              ? isXDL(ST, *MI1)
                ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
                : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
              : isDGEMM(Opc)
                ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16: LLVM_FALLTHROUGH;
          default:
            NeedWaitStates = ST.hasGFX940Insts()
              ? isXDL(ST, *MI1)
                ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
                : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
              : isDGEMM(Opc)
                ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
          }
        }
      }
    } else {
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        switch (TSchedModel.computeInstrLatency(MI1)) {
        case 2:
          NeedWaitStates = ST.hasGFX940Insts()
            ? isXDL(ST, *MI1)
              ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
              : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
            : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 4:
          assert(ST.hasGFX940Insts());
          NeedWaitStates = isXDL(ST, *MI1)
            ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
            : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
          break;
        case 8:
          NeedWaitStates = ST.hasGFX940Insts()
            ? isXDL(ST, *MI1)
              ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
              : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
            : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16: LLVM_FALLTHROUGH;
        default:
          NeedWaitStates = ST.hasGFX940Insts()
            ? isXDL(ST, *MI1)
              ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
              : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
            : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
    }
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  // On gfx90a+ the relevant hazards are checked in checkMAIVALUHazards().
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

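  // VGPR operands of the load/store may have just been copied out of an AGPR
  // by v_accvgpr_read.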
  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
  if (!ST.hasGFX90AInsts())
    return 0;

  auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return isDGEMM(MI.getOpcode());
  };

  // Hazards where the current instruction is itself an MFMA are checked in
  // checkMAIHazards90A().
  if (IsMFMAFn(*MI))
    return 0;

  int WaitStatesNeeded = 0;

  bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
                       SIInstrInfo::isFLAT(*MI) ||
                       SIInstrInfo::isDS(*MI) ||
                       SIInstrInfo::isEXP(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);

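  // Helpers to find the most recent MFMA or DOT instruction writing the
  // register currently being examined (Reg).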
  const MachineInstr *MFMA = nullptr;
  unsigned Reg;
  auto IsMFMAWriteFn = [&Reg, &IsMFMAFn, &MFMA, this](const MachineInstr &MI) {
    if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    MFMA = &MI;
    return true;
  };

  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

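  // RAW: VGPRs read by this VALU, memory, or export instruction may have been
  // written by a recent MFMA or DOT instruction.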
  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
    const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
    const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
    const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
    const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
    const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
    const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
    const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NeedWaitStates = MaxWaitStates;
      switch (HazardDefLatency) {
      case 2:
        NeedWaitStates =
          ST.hasGFX940Insts()
            ? isXDL(ST, *MFMA)
              ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
              : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
            : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
        break;
      case 4:
        assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
        NeedWaitStates =
          isDGEMM(MFMA->getOpcode())
            ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                            : DMFMA4x4WriteVgprVALUReadWaitStates
            : isXDL(ST, *MFMA)
              ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
              : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
        break;
      case 8:
        NeedWaitStates =
          ST.hasGFX940Insts()
            ? isXDL(ST, *MFMA)
              ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
              : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
            : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default:
        NeedWaitStates =
          isDGEMM(MFMA->getOpcode())
            ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
                            : DMFMA16x16WriteVgprVALUReadWaitStates
            : ST.hasGFX940Insts()
              ? isXDL(ST, *MFMA)
                ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
                : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
              : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
        break;
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

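  // Check for a DGEMM executed shortly before a double-precision FMA.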
  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

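  // WAW/WAR: VGPRs defined here may still be written by an earlier MFMA or
  // DOT instruction, or still be read as SrcC by an earlier SMFMA.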
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
    const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
    const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
    const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
    const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
    const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
    const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
    const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      switch (TSchedModel.computeInstrLatency(MFMA)) {
      case 2:
        NeedWaitStates = ST.hasGFX940Insts()
          ? isXDL(ST, *MFMA)
            ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
            : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
          : SMFMA4x4WriteVgprVALUWawWaitStates;
        break;
      case 4:
        assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
        NeedWaitStates = isDGEMM(MFMA->getOpcode())
            ? DMFMA4x4WriteVgprVALUWriteWaitStates
            : isXDL(ST, *MFMA)
              ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
              : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
        break;
      case 8:
        NeedWaitStates = ST.hasGFX940Insts()
          ? isXDL(ST, *MFMA)
            ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
            : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
          : SMFMA16x16WriteVgprVALUWawWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default:
        NeedWaitStates = isDGEMM(MFMA->getOpcode())
                   ? DMFMA16x16WriteVgprVALUWriteWaitStates
                   : ST.hasGFX940Insts()
                     ? isXDL(ST, *MFMA)
                       ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
                       : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
                   : SMFMA32x32WriteVgprVALUWawWaitStates;
        break;
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

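    // WAR: an earlier non-DGEMM MFMA (only XDL ops on gfx940) may still be
    // reading this VGPR as SrcC.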
    auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
                             this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !isXDL(ST, MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
             break;
    case 4:  assert(ST.hasGFX940Insts());
             NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
             break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

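// Ask the scheduler to prefer another candidate when this MFMA would issue
// while a previously seen MFMA has not yet finished executing.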
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMAI(MI) &&
        MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
        MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}