1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "llvm/CodeGen/MachineFunction.h"
17 #include "llvm/CodeGen/ScheduleDAG.h"
18 #include "llvm/Support/TargetParser.h"
19 
20 using namespace llvm;
21 
22 //===----------------------------------------------------------------------===//
23 // Hazard Recognizer Implementation
24 //===----------------------------------------------------------------------===//
25 
26 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
27                                                  const GCNSubtarget &ST);
28 
29 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
30   IsHazardRecognizerMode(false),
31   CurrCycleInstr(nullptr),
32   MF(MF),
33   ST(MF.getSubtarget<GCNSubtarget>()),
34   TII(*ST.getInstrInfo()),
35   TRI(TII.getRegisterInfo()),
36   ClauseUses(TRI.getNumRegUnits()),
37   ClauseDefs(TRI.getNumRegUnits()) {
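  // Hazards involving MFMA/AGPR instructions can require looking back up to
  // 19 wait states (see the MAI hazard checks below), so widen the lookahead
  // window whenever AGPRs are in use; otherwise 5 wait states suffice.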
38   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
39   TSchedModel.init(&ST);
40   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
41 }
42 
43 void GCNHazardRecognizer::Reset() {
44   EmittedInstrs.clear();
45 }
46 
47 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
48   EmitInstruction(SU->getInstr());
49 }
50 
51 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
52   CurrCycleInstr = MI;
53 }
54 
55 static bool isDivFMas(unsigned Opcode) {
56   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
57 }
58 
59 static bool isSGetReg(unsigned Opcode) {
60   return Opcode == AMDGPU::S_GETREG_B32;
61 }
62 
63 static bool isSSetReg(unsigned Opcode) {
64   switch (Opcode) {
65   case AMDGPU::S_SETREG_B32:
66   case AMDGPU::S_SETREG_B32_mode:
67   case AMDGPU::S_SETREG_IMM32_B32:
68   case AMDGPU::S_SETREG_IMM32_B32_mode:
69     return true;
70   }
71   return false;
72 }
73 
74 static bool isRWLane(unsigned Opcode) {
75   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
76 }
77 
78 static bool isRFE(unsigned Opcode) {
79   return Opcode == AMDGPU::S_RFE_B64;
80 }
81 
82 static bool isSMovRel(unsigned Opcode) {
83   switch (Opcode) {
84   case AMDGPU::S_MOVRELS_B32:
85   case AMDGPU::S_MOVRELS_B64:
86   case AMDGPU::S_MOVRELD_B32:
87   case AMDGPU::S_MOVRELD_B64:
88     return true;
89   default:
90     return false;
91   }
92 }
93 
94 static bool isDGEMM(unsigned Opcode) {
95   return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
96          Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
97          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
98          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
99 }
100 
101 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
102   unsigned Opcode = MI.getOpcode();
103 
104   if (!SIInstrInfo::isMAI(MI) ||
105       isDGEMM(Opcode) ||
106       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
107       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
108     return false;
109 
110   return true;
111 }
112 
113 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
114                                     const MachineInstr &MI) {
115   if (TII.isAlwaysGDS(MI.getOpcode()))
116     return true;
117 
118   switch (MI.getOpcode()) {
119   case AMDGPU::S_SENDMSG:
120   case AMDGPU::S_SENDMSGHALT:
121   case AMDGPU::S_TTRACEDATA:
122     return true;
123   // These DS opcodes don't support GDS.
124   case AMDGPU::DS_NOP:
125   case AMDGPU::DS_PERMUTE_B32:
126   case AMDGPU::DS_BPERMUTE_B32:
127     return false;
128   default:
129     if (TII.isDS(MI.getOpcode())) {
130       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
131                                            AMDGPU::OpName::gds);
132       if (MI.getOperand(GDS).getImm())
133         return true;
134     }
135     return false;
136   }
137 }
138 
139 static bool isPermlane(const MachineInstr &MI) {
140   unsigned Opcode = MI.getOpcode();
141   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
142          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
143 }
144 
145 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
146   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
147                                                      AMDGPU::OpName::simm16);
148   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
149 }
150 
151 ScheduleHazardRecognizer::HazardType
152 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
153   MachineInstr *MI = SU->getInstr();
154   // If we are not in "HazardRecognizerMode" we are being run from the
155   // scheduler; track possible stalls from hazards but don't insert noops.
156   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
157 
158   if (MI->isBundle())
159    return NoHazard;
160 
161   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
162     return HazardType;
163 
164   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
165     return HazardType;
166 
167   if (checkFPAtomicToDenormModeHazard(MI) > 0)
168     return HazardType;
169 
170   if (ST.hasNoDataDepHazard())
171     return NoHazard;
172 
173   // FIXME: Should flat be considered vmem?
174   if ((SIInstrInfo::isVMEM(*MI) ||
175        SIInstrInfo::isFLAT(*MI))
176       && checkVMEMHazards(MI) > 0)
177     return HazardType;
178 
179   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
180     return HazardType;
181 
182   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
183     return HazardType;
184 
185   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
186     return HazardType;
187 
188   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
189     return HazardType;
190 
191   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
192        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
193        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
194     return HazardType;
195 
196   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
197     return HazardType;
198 
199   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
200     return HazardType;
201 
202   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
203     return HazardType;
204 
205   if (ST.hasReadM0MovRelInterpHazard() &&
206       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
207       checkReadM0Hazards(MI) > 0)
208     return HazardType;
209 
210   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
211       checkReadM0Hazards(MI) > 0)
212     return HazardType;
213 
214   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
215     return HazardType;
216 
217   if ((SIInstrInfo::isVMEM(*MI) ||
218        SIInstrInfo::isFLAT(*MI) ||
219        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
220     return HazardType;
221 
222   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
223     return HazardType;
224 
225   return NoHazard;
226 }
227 
228 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
229                                 unsigned Quantity) {
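  // An S_NOP immediate encodes (wait states - 1) and a single S_NOP covers at
  // most 8 wait states, so emit as many as needed to reach the requested count.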
230   while (Quantity > 0) {
231     unsigned Arg = std::min(Quantity, 8u);
232     Quantity -= Arg;
233     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
234         .addImm(Arg - 1);
235   }
236 }
237 
238 void GCNHazardRecognizer::processBundle() {
239   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
240   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
241   // Check bundled MachineInstr's for hazards.
242   for (; MI != E && MI->isInsideBundle(); ++MI) {
243     CurrCycleInstr = &*MI;
244     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
245 
246     if (IsHazardRecognizerMode) {
247       fixHazards(CurrCycleInstr);
248 
249       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
250     }
251 
252     // It's unnecessary to track more than MaxLookAhead instructions. Since we
253     // include the bundled MI directly after, only add a maximum of
254     // (MaxLookAhead - 1) noops to EmittedInstrs.
255     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
256       EmittedInstrs.push_front(nullptr);
257 
258     EmittedInstrs.push_front(CurrCycleInstr);
259     EmittedInstrs.resize(MaxLookAhead);
260   }
261   CurrCycleInstr = nullptr;
262 }
263 
264 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
265   IsHazardRecognizerMode = true;
266   CurrCycleInstr = MI;
267   unsigned W = PreEmitNoopsCommon(MI);
268   fixHazards(MI);
269   CurrCycleInstr = nullptr;
270   return W;
271 }
272 
273 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
274   if (MI->isBundle())
275     return 0;
276 
277   int WaitStates = 0;
278 
279   if (SIInstrInfo::isSMRD(*MI))
280     return std::max(WaitStates, checkSMRDHazards(MI));
281 
282   if (ST.hasNSAtoVMEMBug())
283     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
284 
285   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
286 
287   if (ST.hasNoDataDepHazard())
288     return WaitStates;
289 
290   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
291     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
292 
293   if (SIInstrInfo::isVALU(*MI))
294     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
295 
296   if (SIInstrInfo::isDPP(*MI))
297     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
298 
299   if (isDivFMas(MI->getOpcode()))
300     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
301 
302   if (isRWLane(MI->getOpcode()))
303     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
304 
305   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
306        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
307        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
308     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
309 
310   if (MI->isInlineAsm())
311     return std::max(WaitStates, checkInlineAsmHazards(MI));
312 
313   if (isSGetReg(MI->getOpcode()))
314     return std::max(WaitStates, checkGetRegHazards(MI));
315 
316   if (isSSetReg(MI->getOpcode()))
317     return std::max(WaitStates, checkSetRegHazards(MI));
318 
319   if (isRFE(MI->getOpcode()))
320     return std::max(WaitStates, checkRFEHazards(MI));
321 
322   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
323                                            isSMovRel(MI->getOpcode())))
324     return std::max(WaitStates, checkReadM0Hazards(MI));
325 
326   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
327     return std::max(WaitStates, checkReadM0Hazards(MI));
328 
329   if (SIInstrInfo::isMAI(*MI))
330     return std::max(WaitStates, checkMAIHazards(MI));
331 
332   if (SIInstrInfo::isVMEM(*MI) ||
333       SIInstrInfo::isFLAT(*MI) ||
334       SIInstrInfo::isDS(*MI))
335     return std::max(WaitStates, checkMAILdStHazards(MI));
336 
337   return WaitStates;
338 }
339 
340 void GCNHazardRecognizer::EmitNoop() {
341   EmittedInstrs.push_front(nullptr);
342 }
343 
344 void GCNHazardRecognizer::AdvanceCycle() {
345   // When the scheduler detects a stall, it will call AdvanceCycle() without
346   // emitting any instructions.
347   if (!CurrCycleInstr) {
348     EmittedInstrs.push_front(nullptr);
349     return;
350   }
351 
352   // Do not track non-instructions, which do not affect the wait states.
353   // If included, these instructions can push real instructions out of the
354   // fixed-size tracking window so that detectable hazards are missed.
355   if (CurrCycleInstr->isMetaInstruction()) {
356     CurrCycleInstr = nullptr;
357     return;
358   }
359 
360   if (CurrCycleInstr->isBundle()) {
361     processBundle();
362     return;
363   }
364 
365   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
366   if (!NumWaitStates) {
367     CurrCycleInstr = nullptr;
368     return;
369   }
370 
371   // Keep track of emitted instructions
372   EmittedInstrs.push_front(CurrCycleInstr);
373 
374   // Add a nullptr for each additional wait state after the first.  Make sure
375   // not to add more than getMaxLookAhead() items to the list, since we
376   // truncate the list to that size right after this loop.
377   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
378        i < e; ++i) {
379     EmittedInstrs.push_front(nullptr);
380   }
381 
382   // getMaxLookAhead() is the largest number of wait states we will ever need
383   // to insert, so there is no point in keeping track of more than that many
384   // wait states.
385   EmittedInstrs.resize(getMaxLookAhead());
386 
387   CurrCycleInstr = nullptr;
388 }
389 
390 void GCNHazardRecognizer::RecedeCycle() {
391   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
392 }
393 
394 //===----------------------------------------------------------------------===//
395 // Helper Functions
396 //===----------------------------------------------------------------------===//
397 
398 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
399 
400 // Returns the minimum number of wait states since \p I, walking all
401 // predecessors. Only scans until \p IsExpired returns true.
402 // Can only be run in hazard recognizer mode.
403 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
404                               const MachineBasicBlock *MBB,
405                               MachineBasicBlock::const_reverse_instr_iterator I,
406                               int WaitStates, IsExpiredFn IsExpired,
407                               DenseSet<const MachineBasicBlock *> &Visited) {
408   for (auto E = MBB->instr_rend(); I != E; ++I) {
409     // Don't add WaitStates for parent BUNDLE instructions.
410     if (I->isBundle())
411       continue;
412 
413     if (IsHazard(*I))
414       return WaitStates;
415 
416     if (I->isInlineAsm() || I->isMetaInstruction())
417       continue;
418 
419     WaitStates += SIInstrInfo::getNumWaitStates(*I);
420 
421     if (IsExpired(*I, WaitStates))
422       return std::numeric_limits<int>::max();
423   }
424 
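  // The hazard was not found in this block: take the minimum over all
  // predecessors, visiting each block only once to avoid cycles.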
425   int MinWaitStates = std::numeric_limits<int>::max();
426   for (MachineBasicBlock *Pred : MBB->predecessors()) {
427     if (!Visited.insert(Pred).second)
428       continue;
429 
430     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
431                                WaitStates, IsExpired, Visited);
432 
433     MinWaitStates = std::min(MinWaitStates, W);
434   }
435 
436   return MinWaitStates;
437 }
438 
439 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
440                               const MachineInstr *MI, IsExpiredFn IsExpired) {
441   DenseSet<const MachineBasicBlock *> Visited;
442   return getWaitStatesSince(IsHazard, MI->getParent(),
443                             std::next(MI->getReverseIterator()),
444                             0, IsExpired, Visited);
445 }
446 
447 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
448   if (IsHazardRecognizerMode) {
449     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
450       return WaitStates >= Limit;
451     };
452     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
453   }
454 
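  // Scheduler mode: scan only the instructions already tracked in
  // EmittedInstrs, where a nullptr entry stands for a wait state with no
  // tracked instruction.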
455   int WaitStates = 0;
456   for (MachineInstr *MI : EmittedInstrs) {
457     if (MI) {
458       if (IsHazard(*MI))
459         return WaitStates;
460 
461       if (MI->isInlineAsm())
462         continue;
463     }
464     ++WaitStates;
465 
466     if (WaitStates >= Limit)
467       break;
468   }
469   return std::numeric_limits<int>::max();
470 }
471 
472 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
473                                                IsHazardFn IsHazardDef,
474                                                int Limit) {
475   const SIRegisterInfo *TRI = ST.getRegisterInfo();
476 
477   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
478     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
479   };
480 
481   return getWaitStatesSince(IsHazardFn, Limit);
482 }
483 
484 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
485                                                   int Limit) {
486   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
487     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
488   };
489 
490   return getWaitStatesSince(IsHazardFn, Limit);
491 }
492 
493 //===----------------------------------------------------------------------===//
494 // No-op Hazard Detection
495 //===----------------------------------------------------------------------===//
496 
497 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
498                         MCRegister Reg) {
499   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
500     BV.set(*RUI);
501 }
502 
503 static void addRegsToSet(const SIRegisterInfo &TRI,
504                          iterator_range<MachineInstr::const_mop_iterator> Ops,
505                          BitVector &Set) {
506   for (const MachineOperand &Op : Ops) {
507     if (Op.isReg())
508       addRegUnits(TRI, Set, Op.getReg().asMCReg());
509   }
510 }
511 
512 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
513   // XXX: Do we need to worry about implicit operands?
514   addRegsToSet(TRI, MI.defs(), ClauseDefs);
515   addRegsToSet(TRI, MI.uses(), ClauseUses);
516 }
517 
518 static bool breaksSMEMSoftClause(MachineInstr *MI) {
519   return !SIInstrInfo::isSMRD(*MI);
520 }
521 
522 static bool breaksVMEMSoftClause(MachineInstr *MI) {
523   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
524 }
525 
526 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
527   // SMEM soft clauses are only present on VI+, and only matter if XNACK is
528   // enabled.
529   if (!ST.isXNACKEnabled())
530     return 0;
531 
532   bool IsSMRD = TII.isSMRD(*MEM);
533 
534   resetClause();
535 
536   // A soft-clause is any group of consecutive SMEM instructions.  The
537   // instructions in this group may return out of order and/or may be
538   // replayed (i.e. the same instruction issued more than once).
539   //
540   // In order to handle these situations correctly we need to make sure that
541   // when a clause has more than one instruction, no instruction in the clause
542   // writes to a register that is read by another instruction in the clause
543   // (including itself). If we encounter this situation, we need to break the
544   // clause by inserting a non-SMEM instruction.
545 
546   for (MachineInstr *MI : EmittedInstrs) {
547     // When we hit a non-SMEM instruction then we have passed the start of the
548     // clause and we can stop.
549     if (!MI)
550       break;
551 
552     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
553       break;
554 
555     addClauseInst(*MI);
556   }
557 
558   if (ClauseDefs.none())
559     return 0;
560 
561   // We need to make sure not to put loads and stores in the same clause if they
562   // use the same address. For now, just start a new clause whenever we see a
563   // store.
564   if (MEM->mayStore())
565     return 1;
566 
567   addClauseInst(*MEM);
568 
569   // If the set of defs and uses intersect then we cannot add this instruction
570   // to the clause, so we have a hazard.
571   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
572 }
573 
574 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
575   int WaitStatesNeeded = 0;
576 
577   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
578 
579   // This SMRD hazard only affects SI.
580   if (!ST.hasSMRDReadVALUDefHazard())
581     return WaitStatesNeeded;
582 
583   // A read of an SGPR by an SMRD instruction requires 4 wait states when the
584   // SGPR was written by a VALU instruction.
585   int SmrdSgprWaitStates = 4;
586   auto IsHazardDefFn = [this](const MachineInstr &MI) {
587     return TII.isVALU(MI);
588   };
589   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
590     return TII.isSALU(MI);
591   };
592 
593   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
594 
595   for (const MachineOperand &Use : SMRD->uses()) {
596     if (!Use.isReg())
597       continue;
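    // "Required minus elapsed" gives the wait states still outstanding; a
    // result of zero or less means this use has no remaining hazard.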
598     int WaitStatesNeededForUse =
599         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
600                                                    SmrdSgprWaitStates);
601     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
602 
603     // This fixes what appears to be undocumented hardware behavior in SI where
604     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
605     // needs some number of nops in between. We don't know how many we need, but
606     // let's use 4. This wasn't discovered before probably because the only
607     // case when this happens is when we expand a 64-bit pointer into a full
608     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
609     // probably never encountered in the closed-source land.
610     if (IsBufferSMRD) {
611       int WaitStatesNeededForUse =
612         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
613                                                    IsBufferHazardDefFn,
614                                                    SmrdSgprWaitStates);
615       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
616     }
617   }
618 
619   return WaitStatesNeeded;
620 }
621 
622 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
623   if (!ST.hasVMEMReadSGPRVALUDefHazard())
624     return 0;
625 
626   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
627 
628   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
629   // SGPR was written by a VALU instruction.
630   const int VmemSgprWaitStates = 5;
631   auto IsHazardDefFn = [this](const MachineInstr &MI) {
632     return TII.isVALU(MI);
633   };
634   for (const MachineOperand &Use : VMEM->uses()) {
635     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
636       continue;
637 
638     int WaitStatesNeededForUse =
639         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
640                                                    VmemSgprWaitStates);
641     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
642   }
643   return WaitStatesNeeded;
644 }
645 
646 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
647   const SIRegisterInfo *TRI = ST.getRegisterInfo();
648   const SIInstrInfo *TII = ST.getInstrInfo();
649 
650   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
651   int DppVgprWaitStates = 2;
652   int DppExecWaitStates = 5;
653   int WaitStatesNeeded = 0;
654   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
655     return TII->isVALU(MI);
656   };
657 
658   for (const MachineOperand &Use : DPP->uses()) {
659     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
660       continue;
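    // Any prior write to the VGPR (not just a VALU write) within the window
    // counts for the DPP VGPR read hazard.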
661     int WaitStatesNeededForUse =
662         DppVgprWaitStates - getWaitStatesSinceDef(
663                                 Use.getReg(),
664                                 [](const MachineInstr &) { return true; },
665                                 DppVgprWaitStates);
666     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
667   }
668 
669   WaitStatesNeeded = std::max(
670       WaitStatesNeeded,
671       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
672                                                 DppExecWaitStates));
673 
674   return WaitStatesNeeded;
675 }
676 
677 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
678   const SIInstrInfo *TII = ST.getInstrInfo();
679 
680   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
681   // instruction.
682   const int DivFMasWaitStates = 4;
683   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
684     return TII->isVALU(MI);
685   };
686   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
687                                                DivFMasWaitStates);
688 
689   return DivFMasWaitStates - WaitStatesNeeded;
690 }
691 
692 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
693   const SIInstrInfo *TII = ST.getInstrInfo();
694   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
695 
696   const int GetRegWaitStates = 2;
697   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
698     return GetRegHWReg == getHWReg(TII, MI);
699   };
700   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
701 
702   return GetRegWaitStates - WaitStatesNeeded;
703 }
704 
705 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
706   const SIInstrInfo *TII = ST.getInstrInfo();
707   unsigned HWReg = getHWReg(TII, *SetRegInstr);
708 
709   const int SetRegWaitStates = ST.getSetRegWaitStates();
710   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
711     return HWReg == getHWReg(TII, MI);
712   };
713   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
714   return SetRegWaitStates - WaitStatesNeeded;
715 }
716 
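// Returns the operand index of the store data whose register could be
// overwritten by a following VALU write (the hazard handled by
// checkVALUHazardsHelper), or -1 if this instruction cannot create it.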
717 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
718   if (!MI.mayStore())
719     return -1;
720 
721   const SIInstrInfo *TII = ST.getInstrInfo();
722   unsigned Opcode = MI.getOpcode();
723   const MCInstrDesc &Desc = MI.getDesc();
724 
725   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
726   int VDataRCID = -1;
727   if (VDataIdx != -1)
728     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
729 
730   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
731     // There is no hazard if the instruction does not use vector regs
732     // (like wbinvl1)
733     if (VDataIdx == -1)
734       return -1;
735     // For MUBUF/MTBUF instructions this hazard only exists if the
736     // instruction is not using a register in the soffset field.
737     const MachineOperand *SOffset =
738         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
739     // If we have no soffset operand, then assume this field has been
740     // hardcoded to zero.
741     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
742         (!SOffset || !SOffset->isReg()))
743       return VDataIdx;
744   }
745 
746   // MIMG instructions create a hazard if they don't use a 256-bit T#, the
747   // store size is greater than 8 bytes, and more than two bits of their
748   // dmask are set.
749   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
750   if (TII->isMIMG(MI)) {
751     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
752     assert(SRsrcIdx != -1 &&
753            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
754     (void)SRsrcIdx;
755   }
756 
757   if (TII->isFLAT(MI)) {
758     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
759     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
760       return DataIdx;
761   }
762 
763   return -1;
764 }
765 
766 int
767 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
768                                             const MachineRegisterInfo &MRI) {
769   // Helper to check for the hazard where VMEM instructions that store more than
770   // 8 bytes can have their store data overwritten by the next instruction.
771   const SIRegisterInfo *TRI = ST.getRegisterInfo();
772 
773   const int VALUWaitStates = 1;
774   int WaitStatesNeeded = 0;
775 
776   if (!TRI->isVectorRegister(MRI, Def.getReg()))
777     return WaitStatesNeeded;
778   Register Reg = Def.getReg();
779   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
780     int DataIdx = createsVALUHazard(MI);
781     return DataIdx >= 0 &&
782            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
783   };
784   int WaitStatesNeededForDef =
785     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
786   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
787 
788   return WaitStatesNeeded;
789 }
790 
791 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
792   // This checks for the hazard where VMEM instructions that store more than
793   // 8 bytes can have their store data overwritten by the next instruction.
794   if (!ST.has12DWordStoreHazard())
795     return 0;
796 
797   const MachineRegisterInfo &MRI = MF.getRegInfo();
798   int WaitStatesNeeded = 0;
799 
800   for (const MachineOperand &Def : VALU->defs()) {
801     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
802   }
803 
804   return WaitStatesNeeded;
805 }
806 
807 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
808   // This checks for hazards associated with inline asm statements.
809   // Since inline asms can contain just about anything, we use this
810   // to call/leverage other check*Hazard routines. Note that
811   // this function doesn't attempt to address all possible inline asm
812   // hazards (good luck), but is a collection of what has been
813   // problematic thus far.
814 
815   // see checkVALUHazards()
816   if (!ST.has12DWordStoreHazard())
817     return 0;
818 
819   const MachineRegisterInfo &MRI = MF.getRegInfo();
820   int WaitStatesNeeded = 0;
821 
822   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
823        I != E; ++I) {
824     const MachineOperand &Op = IA->getOperand(I);
825     if (Op.isReg() && Op.isDef()) {
826       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
827     }
828   }
829 
830   return WaitStatesNeeded;
831 }
832 
833 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
834   const SIInstrInfo *TII = ST.getInstrInfo();
835   const SIRegisterInfo *TRI = ST.getRegisterInfo();
836   const MachineRegisterInfo &MRI = MF.getRegInfo();
837 
838   const MachineOperand *LaneSelectOp =
839       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
840 
841   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
842     return 0;
843 
844   Register LaneSelectReg = LaneSelectOp->getReg();
845   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
846 
847   const int RWLaneWaitStates = 4;
848   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
849                                               RWLaneWaitStates);
850   return RWLaneWaitStates - WaitStatesSince;
851 }
852 
853 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
854   if (!ST.hasRFEHazards())
855     return 0;
856 
857   const SIInstrInfo *TII = ST.getInstrInfo();
858 
859   const int RFEWaitStates = 1;
860 
861   auto IsHazardFn = [TII](const MachineInstr &MI) {
862     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
863   };
864   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
865   return RFEWaitStates - WaitStatesNeeded;
866 }
867 
868 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
869   const SIInstrInfo *TII = ST.getInstrInfo();
870   const int SMovRelWaitStates = 1;
871   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
872   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
873                                                    SMovRelWaitStates);
874 }
875 
876 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
877   fixVMEMtoScalarWriteHazards(MI);
878   fixVcmpxPermlaneHazards(MI);
879   fixSMEMtoVectorWriteHazards(MI);
880   fixVcmpxExecWARHazard(MI);
881   fixLdsBranchVmemWARHazard(MI);
882 }
883 
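// On subtargets with the vcmpx-permlane hazard, a V_PERMLANE* that follows a
// VOPC needs a real (non-V_NOP) VALU instruction in between; insert a
// self-move of src0 to break the hazard.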
884 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
885   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
886     return false;
887 
888   const SIInstrInfo *TII = ST.getInstrInfo();
889   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };
890 
891   auto IsExpiredFn = [](const MachineInstr &MI, int) {
892     unsigned Opc = MI.getOpcode();
893     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
894            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
895   };
896 
897   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
898       std::numeric_limits<int>::max())
899     return false;
900 
901   // V_NOP will be discarded by SQ.
902   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
903   // which is always a VGPR and available.
904   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
905   Register Reg = Src0->getReg();
906   bool IsUndef = Src0->isUndef();
907   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
908           TII->get(AMDGPU::V_MOV_B32_e32))
909     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
910     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
911 
912   return true;
913 }
914 
915 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
916   if (!ST.hasVMEMtoScalarWriteHazard())
917     return false;
918 
919   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
920     return false;
921 
922   if (MI->getNumDefs() == 0)
923     return false;
924 
925   const SIRegisterInfo *TRI = ST.getRegisterInfo();
926 
927   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
928     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
929         !SIInstrInfo::isFLAT(I))
930       return false;
931 
932     for (const MachineOperand &Def : MI->defs()) {
933       const MachineOperand *Op =
934           I.findRegisterUseOperand(Def.getReg(), false, TRI);
935       if (!Op)
936         continue;
937       return true;
938     }
939     return false;
940   };
941 
942   auto IsExpiredFn = [](const MachineInstr &MI, int) {
943     return SIInstrInfo::isVALU(MI) ||
944            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
945             !MI.getOperand(0).getImm()) ||
946            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
947             MI.getOperand(0).getImm() == 0xffe3);
948   };
949 
950   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
951       std::numeric_limits<int>::max())
952     return false;
953 
954   const SIInstrInfo *TII = ST.getInstrInfo();
955   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
956           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
957       .addImm(0xffe3);
958   return true;
959 }
960 
961 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
962   if (!ST.hasSMEMtoVectorWriteHazard())
963     return false;
964 
965   if (!SIInstrInfo::isVALU(*MI))
966     return false;
967 
968   unsigned SDSTName;
969   switch (MI->getOpcode()) {
970   case AMDGPU::V_READLANE_B32:
971   case AMDGPU::V_READFIRSTLANE_B32:
972     SDSTName = AMDGPU::OpName::vdst;
973     break;
974   default:
975     SDSTName = AMDGPU::OpName::sdst;
976     break;
977   }
978 
979   const SIInstrInfo *TII = ST.getInstrInfo();
980   const SIRegisterInfo *TRI = ST.getRegisterInfo();
981   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
982   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
983   if (!SDST) {
984     for (const auto &MO : MI->implicit_operands()) {
985       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
986         SDST = &MO;
987         break;
988       }
989     }
990   }
991 
992   if (!SDST)
993     return false;
994 
995   const Register SDSTReg = SDST->getReg();
996   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
997     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
998   };
999 
1000   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1001     if (TII->isSALU(MI)) {
1002       switch (MI.getOpcode()) {
1003       case AMDGPU::S_SETVSKIP:
1004       case AMDGPU::S_VERSION:
1005       case AMDGPU::S_WAITCNT_VSCNT:
1006       case AMDGPU::S_WAITCNT_VMCNT:
1007       case AMDGPU::S_WAITCNT_EXPCNT:
1008         // These instructions cannot mitigate the hazard.
1009         return false;
1010       case AMDGPU::S_WAITCNT_LGKMCNT:
1011         // Reducing lgkmcnt to 0 always mitigates the hazard.
1012         return (MI.getOperand(1).getImm() == 0) &&
1013                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1014       case AMDGPU::S_WAITCNT: {
1015         const int64_t Imm = MI.getOperand(0).getImm();
1016         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1017         return (Decoded.LgkmCnt == 0);
1018       }
1019       default:
1020         // SOPP instructions cannot mitigate the hazard.
1021         if (TII->isSOPP(MI))
1022           return false;
1023         // At this point the SALU can be assumed to mitigate the hazard
1024         // because either:
1025         // (a) it is independent of the at risk SMEM (breaking chain),
1026         // or
1027         // (b) it is dependent on the SMEM, in which case an appropriate
1028         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1029         //     SMEM instruction.
1030         return true;
1031       }
1032     }
1033     return false;
1034   };
1035 
1036   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1037       std::numeric_limits<int>::max())
1038     return false;
1039 
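  // Per the expiry logic above, any qualifying SALU breaks the hazard; a
  // harmless move to the null register serves that purpose.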
1040   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1041           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1042       .addImm(0);
1043   return true;
1044 }
1045 
1046 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1047   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1048     return false;
1049 
1050   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1051   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1052     return false;
1053 
1054   auto IsHazardFn = [TRI](const MachineInstr &I) {
1055     if (SIInstrInfo::isVALU(I))
1056       return false;
1057     return I.readsRegister(AMDGPU::EXEC, TRI);
1058   };
1059 
1060   const SIInstrInfo *TII = ST.getInstrInfo();
1061   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1062     if (SIInstrInfo::isVALU(MI)) {
1063       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1064         return true;
1065       for (auto MO : MI.implicit_operands())
1066         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1067           return true;
1068     }
1069     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1070         (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
1071       return true;
1072     return false;
1073   };
1074 
1075   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1076       std::numeric_limits<int>::max())
1077     return false;
1078 
1079   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1080           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1081     .addImm(0xfffe);
1082   return true;
1083 }
1084 
1085 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1086                                                  const GCNSubtarget &ST) {
1087   if (!ST.hasLdsBranchVmemWARHazard())
1088     return false;
1089 
1090   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1091   // instructions need to appear in the same function.
1092   bool HasLds = false;
1093   bool HasVmem = false;
1094   for (auto &MBB : MF) {
1095     for (auto &MI : MBB) {
1096       HasLds |= SIInstrInfo::isDS(MI);
1097       HasVmem |=
1098           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1099       if (HasLds && HasVmem)
1100         return true;
1101     }
1102   }
1103   return false;
1104 }
1105 
1106 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1107   if (!RunLdsBranchVmemWARHazardFixup)
1108     return false;
1109 
1110   assert(ST.hasLdsBranchVmemWARHazard());
1111 
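  // The hazard is a WAR between an LDS access and a VMEM access on opposite
  // sides of a branch, resolved by an s_waitcnt_vscnt null, 0 between them.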
1112   auto IsHazardInst = [](const MachineInstr &MI) {
1113     if (SIInstrInfo::isDS(MI))
1114       return 1;
1115     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1116       return 2;
1117     return 0;
1118   };
1119 
1120   auto InstType = IsHazardInst(*MI);
1121   if (!InstType)
1122     return false;
1123 
1124   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1125     return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1126                                I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1127                                !I.getOperand(1).getImm());
1128   };
1129 
1130   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1131     if (!I.isBranch())
1132       return false;
1133 
1134     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1135       auto InstType2 = IsHazardInst(I);
1136       return InstType2 && InstType != InstType2;
1137     };
1138 
1139     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1140       auto InstType2 = IsHazardInst(I);
1141       if (InstType == InstType2)
1142         return true;
1143 
1144       return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1145              I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1146              !I.getOperand(1).getImm();
1147     };
1148 
1149     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1150            std::numeric_limits<int>::max();
1151   };
1152 
1153   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1154       std::numeric_limits<int>::max())
1155     return false;
1156 
1157   const SIInstrInfo *TII = ST.getInstrInfo();
1158   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1159           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1160     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1161     .addImm(0);
1162 
1163   return true;
1164 }
1165 
1166 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1167   int NSAtoVMEMWaitStates = 1;
1168 
1169   if (!ST.hasNSAtoVMEMBug())
1170     return 0;
1171 
1172   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1173     return 0;
1174 
1175   const SIInstrInfo *TII = ST.getInstrInfo();
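  // Only VMEM instructions with bit 1 or bit 2 of the offset set are affected.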
1176   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1177   if (!Offset || (Offset->getImm() & 6) == 0)
1178     return 0;
1179 
1180   auto IsHazardFn = [TII](const MachineInstr &I) {
1181     if (!SIInstrInfo::isMIMG(I))
1182       return false;
1183     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1184     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1185            TII->getInstSizeInBytes(I) >= 16;
1186   };
1187 
1188   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1189 }
1190 
1191 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1192   int FPAtomicToDenormModeWaitStates = 3;
1193 
1194   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1195     return 0;
1196 
1197   auto IsHazardFn = [](const MachineInstr &I) {
1198     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1199       return false;
1200     return SIInstrInfo::isFPAtomic(I);
1201   };
1202 
1203   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1204     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1205       return true;
1206 
1207     switch (MI.getOpcode()) {
1208     case AMDGPU::S_WAITCNT:
1209     case AMDGPU::S_WAITCNT_VSCNT:
1210     case AMDGPU::S_WAITCNT_VMCNT:
1211     case AMDGPU::S_WAITCNT_EXPCNT:
1212     case AMDGPU::S_WAITCNT_LGKMCNT:
1213     case AMDGPU::S_WAIT_IDLE:
1214       return true;
1215     default:
1216       break;
1217     }
1218 
1219     return false;
1220   };
1221 
1222   return FPAtomicToDenormModeWaitStates -
1223          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1224 }
1225 
1226 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1227   assert(SIInstrInfo::isMAI(*MI));
1228 
1229   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1230 }
1231 
1232 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1233   int WaitStatesNeeded = 0;
1234   unsigned Opc = MI->getOpcode();
1235 
1236   auto IsVALUFn = [](const MachineInstr &MI) {
1237     return SIInstrInfo::isVALU(MI);
1238   };
1239 
1240   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1241     const int LegacyVALUWritesVGPRWaitStates = 2;
1242     const int VALUWritesExecWaitStates = 4;
1243     const int MaxWaitStates = 4;
1244 
1245     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1246       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1247     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1248 
1249     if (WaitStatesNeeded < MaxWaitStates) {
1250       for (const MachineOperand &Use : MI->explicit_uses()) {
1251         const int MaxWaitStates = 2;
1252 
1253         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1254           continue;
1255 
1256         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1257           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1258         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1259 
1260         if (WaitStatesNeeded == MaxWaitStates)
1261           break;
1262       }
1263     }
1264   }
1265 
1266   auto IsMFMAFn = [](const MachineInstr &MI) {
1267     return SIInstrInfo::isMAI(MI) &&
1268            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1269            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1270   };
1271 
1272   for (const MachineOperand &Op : MI->explicit_operands()) {
1273     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1274       continue;
1275 
1276     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1277       continue;
1278 
1279     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1280     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1281     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1282     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1283     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1284     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1285     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1286     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1287     const int MaxWaitStates = 18;
1288     Register Reg = Op.getReg();
1289     unsigned HazardDefLatency = 0;
1290 
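    // HazardDefLatency records the scheduling-model latency of the defining
    // MFMA; latencies of 2, 8, and 16 correspond to the 4x4, 16x16, and 32x32
    // variants handled in the switches below.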
1291     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
1292                                this](const MachineInstr &MI) {
1293       if (!IsMFMAFn(MI))
1294         return false;
1295       Register DstReg = MI.getOperand(0).getReg();
1296       if (DstReg == Reg)
1297         return false;
1298       HazardDefLatency =
1299           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1300       return TRI.regsOverlap(DstReg, Reg);
1301     };
1302 
1303     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1304                                                    MaxWaitStates);
1305     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1306     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1307     int OpNo = MI->getOperandNo(&Op);
1308     if (OpNo == SrcCIdx) {
1309       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1310     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
1311       switch (HazardDefLatency) {
1312       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1313                break;
1314       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1315                break;
1316       case 16: LLVM_FALLTHROUGH;
1317       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1318                break;
1319       }
1320     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1321       switch (HazardDefLatency) {
1322       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1323                break;
1324       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1325                break;
1326       case 16: LLVM_FALLTHROUGH;
1327       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1328                break;
1329       }
1330     }
1331 
1332     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1333     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1334 
1335     if (WaitStatesNeeded == MaxWaitStates)
1336       return WaitStatesNeeded; // Early exit.
1337 
1338     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
1339       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1340         return false;
1341       Register DstReg = MI.getOperand(0).getReg();
1342       return TRI.regsOverlap(Reg, DstReg);
1343     };
1344 
1345     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1346     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1347     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1348     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1349     if (OpNo == SrcCIdx)
1350       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1351     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
1352       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1353 
1354     WaitStatesNeededForUse = NeedWaitStates -
1355       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1356     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1357 
1358     if (WaitStatesNeeded == MaxWaitStates)
1359       return WaitStatesNeeded; // Early exit.
1360   }
1361 
1362   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1363     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1364     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1365     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1366     const int MaxWaitStates = 13;
1367     Register DstReg = MI->getOperand(0).getReg();
1368     unsigned HazardDefLatency = 0;
1369 
1370     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
1371                          this](const MachineInstr &MI) {
1372       if (!IsMFMAFn(MI))
1373         return false;
1374       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
1375       HazardDefLatency =
1376           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1377       return TRI.regsOverlap(Reg, DstReg);
1378     };
1379 
1380     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1381     int NeedWaitStates;
1382     switch (HazardDefLatency) {
1383     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1384              break;
1385     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1386              break;
1387     case 16: LLVM_FALLTHROUGH;
1388     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1389              break;
1390     }
1391 
1392     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1393     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1394   }
1395 
1396   return WaitStatesNeeded;
1397 }
1398 
1399 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
1400   int WaitStatesNeeded = 0;
1401   unsigned Opc = MI->getOpcode();
1402 
1403   auto IsMFMAFn = [](const MachineInstr &MI) {
1404     return SIInstrInfo::isMAI(MI) &&
1405            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1406            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1407   };
1408 
1409   auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
1410     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
1411   };
1412 
1413   auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
1414     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
1415   };
1416 
1417   if (!IsMFMAFn(*MI))
1418     return WaitStatesNeeded;
1419 
1420   const int VALUWritesExecWaitStates = 4;
1421   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1422     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
1423                           VALUWritesExecWaitStates);
1424   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1425 
1426   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1427 
1428   // Loop for both DGEMM and S/HGEMM 2nd instruction.
1429   for (const MachineOperand &Use : MI->explicit_uses()) {
1430     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
1431     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
1432     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
1433     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
1434     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
1435     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
1436     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
1437     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
1438     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
1439     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
1440     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
1441     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
1442     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
1443     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
1444     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
1445     const int MaxWaitStates = 19;
1446 
1447     if (!Use.isReg())
1448       continue;
1449     unsigned Reg = Use.getReg();
1450     bool FullReg;
1451     const MachineInstr *MI1;
1452 
1453     auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
1454                                      this](const MachineInstr &MI) {
1455       if (!IsMFMAFn(MI))
1456         return false;
1457       if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
1458         return false;
1459       Register DstReg = MI.getOperand(0).getReg();
1460       FullReg = (DstReg == Reg);
1461       MI1 = &MI;
1462       return TRI.regsOverlap(DstReg, Reg);
1463     };
1464 
1465     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
1466       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
1467     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1468 
1469     int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,
1470                                               MaxWaitStates);
1471     if (NumWaitStates == std::numeric_limits<int>::max())
1472       continue;
1473 
1474     int OpNo = MI->getOperandNo(&Use);
1475     unsigned Opc1 = MI1->getOpcode();
1476     int NeedWaitStates = 0;
1477     if (OpNo == SrcCIdx) {
1478       if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
1479         NeedWaitStates = 0;
1480       } else if (FullReg) {
1481         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1482              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
1483             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1484              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
1485           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
1486       } else {
1487         switch (Opc1) {
1488         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
1489         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
1490           if (!isXDL(ST, *MI))
1491             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
1492           break;
1493         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
1494         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
1495           if (!isXDL(ST, *MI))
1496             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
1497           break;
1498         default:
1499           switch (TSchedModel.computeInstrLatency(MI1)) {
1500           case 2:
1501             NeedWaitStates = isDGEMM(Opc)
1502               ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
1503               : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
1504             break;
1505           case 8:
1506             NeedWaitStates = isDGEMM(Opc)
1507               ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
1508               : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
1509             break;
1510           case 16: LLVM_FALLTHROUGH;
1511           default:
1512             NeedWaitStates = isDGEMM(Opc)
1513               ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
1514               : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
1515           }
1516         }
1517       }
1518     } else {
1519       switch (Opc1) {
1520       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
1521       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
1522         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
1523         break;
1524       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
1525       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
1526         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
1527         break;
1528       default:
1529         switch (TSchedModel.computeInstrLatency(MI1)) {
1530         case 2:
1531           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
1532           break;
1533         case 8:
1534           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
1535           break;
1536         case 16: LLVM_FALLTHROUGH;
1537         default:
1538           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
1539         }
1540       }
1541     }
1542     if (WaitStatesNeeded >= NeedWaitStates)
1543       continue;
1544 
1545     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
1546     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1547 
1548     if (WaitStatesNeeded == MaxWaitStates)
1549       break;
1550   }
1551 
1552   return WaitStatesNeeded;
1553 }
1554 
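// Wait states needed when a VGPR used by this (typically load/store)
// instruction was recently produced by v_accvgpr_read, or when a
// v_accvgpr_read/write closely follows a VALU write to that VGPR.
// Only relevant before gfx90a.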
1555 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1556   // On gfx90a+ the relevant hazards are checked in checkMAIVALUHazards().
1557   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
1558     return 0;
1559 
1560   int WaitStatesNeeded = 0;
1561 
1562   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
1563     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
1564   };
1565 
1566   for (const MachineOperand &Op : MI->explicit_uses()) {
1567     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1568       continue;
1569 
1570     Register Reg = Op.getReg();
1571 
1572     const int AccVgprReadLdStWaitStates = 2;
1573     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
1574     const int MaxWaitStates = 2;
1575 
1576     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1577       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1578     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1579 
1580     if (WaitStatesNeeded == MaxWaitStates)
1581       return WaitStatesNeeded; // Early exit.
1582 
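    // A v_accvgpr_read/write issued shortly after a VALU write to Reg also
    // forces a wait state before this instruction may access Reg.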
1583     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
1584       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
1585           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1586         return false;
1587       auto IsVALUFn = [](const MachineInstr &MI) {
1588         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
1589       };
1590       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1591              std::numeric_limits<int>::max();
1592     };
1593 
1594     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1595       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
1596     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1597   }
1598 
1599   return WaitStatesNeeded;
1600 }
1601 
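// gfx90a+ wait states between MFMA/DOT producers and VALU, memory, or export
// instructions that read or overwrite the affected VGPRs.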
1602 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
1603   if (!ST.hasGFX90AInsts())
1604     return 0;
1605 
1606   auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
1607     return SIInstrInfo::isMAI(MI) &&
1608            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1609            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1610   };
1611 
1612   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
1613     return isDGEMM(MI.getOpcode());
1614   };
1615 
1616   // If MI is itself an MFMA, its hazards are checked in checkMAIHazards90A().
1617   if (IsMFMAFn(*MI))
1618     return 0;
1619 
1620   int WaitStatesNeeded = 0;
1621 
1622   bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
1623                        SIInstrInfo::isFLAT(*MI) ||
1624                        SIInstrInfo::isDS(*MI) ||
1625                        SIInstrInfo::isEXP(*MI);
1626   bool IsVALU = SIInstrInfo::isVALU(*MI);
1627 
1628   const MachineInstr *MFMA = nullptr;
1629   unsigned Reg;
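  // Match a prior DGEMM or XDL MFMA whose destination overlaps Reg and
  // remember it; its latency selects the wait-state count below.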
1630   auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA,
1631                               this](const MachineInstr &MI) {
1632     if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
1633       return false;
1634     if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
1635       return false;
1636     MFMA = &MI;
1637     return true;
1638   };
1639 
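  // Match a prior DOT instruction whose destination overlaps Reg.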
1640   const MachineInstr *DOT = nullptr;
1641   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
1642     if (!SIInstrInfo::isDOT(MI) ||
1643         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
1644       return false;
1645     DOT = &MI;
1646     return true;
1647   };
1648 
1649   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1650                                            AMDGPU::OpName::src2);
1651 
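  // RAW hazards: this VALU, memory, or export instruction reads a VGPR that a
  // recent DOT or MFMA (DGEMM or XDL) has written, e.g. a DGEMM writing
  // v[0:3] followed by a flat store reading v0.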
1652   if (IsMemOrExport || IsVALU) {
1653     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
1654     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
1655     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
1656     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
1657     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
1658     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
1659     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
1660     const int DotWriteSameDotReadSrcAB = 3;
1661     const int DotWriteDifferentVALURead = 3;
1662     const int MaxWaitStates = 19;
1663 
1664     for (const MachineOperand &Use : MI->explicit_uses()) {
1665       if (!Use.isReg())
1666         continue;
1667       Reg = Use.getReg();
1668 
1669       DOT = nullptr;
1670       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
1671                                                      MaxWaitStates);
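      // A DOT wrote a register this instruction reads. If this is the same
      // DOT opcode, only src0/src1 reads need the gap; any other consumer
      // needs it for every read.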
1672       if (DOT) {
1673         int NeedWaitStates = 0;
1674         if (DOT->getOpcode() == MI->getOpcode()) {
1675           if ((int)MI->getOperandNo(&Use) != SrcCIdx)
1676             NeedWaitStates = DotWriteSameDotReadSrcAB;
1677         } else {
1678           NeedWaitStates = DotWriteDifferentVALURead;
1679         }
1680 
1681         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1682         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1683       }
1684 
1685       MFMA = nullptr;
1686       WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
1687                                                  MaxWaitStates);
1688       if (!MFMA)
1689         continue;
1690 
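      // Pick the wait-state count from the producer's latency: 2, 8, and 16
      // cycles correspond to 4x4, 16x16, and 32x32 SMFMAs; latency 4 is the
      // 4x4 DGEMM.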
1691       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
1692       int NeedWaitStates = MaxWaitStates;
1693       switch (HazardDefLatency) {
1694       case 2:
1695         NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
1696         break;
1697       case 4:
1698         assert(isDGEMM(MFMA->getOpcode()));
1699         NeedWaitStates =
1700             IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
1701                           : DMFMA4x4WriteVgprVALUReadWaitStates;
1702         break;
1703       case 8:
1704         NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
1705         break;
1706       case 16: LLVM_FALLTHROUGH;
1707       default:
1708         NeedWaitStates =
1709           isDGEMM(MFMA->getOpcode())
1710             ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
1711                             : DMFMA16x16WriteVgprVALUReadWaitStates
1712             : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
1713         break;
1714       }
1715 
1716       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1717       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1718 
1719       if (WaitStatesNeeded == MaxWaitStates)
1720         break;
1721     }
1722   }
1723 
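  // A double-precision FMA issued shortly after any DGEMM needs extra wait
  // states, independent of register overlap.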
1724   unsigned Opc = MI->getOpcode();
1725   const int DMFMAToFMA64WaitStates = 2;
1726   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
1727        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
1728        Opc == AMDGPU::V_FMAC_F64_dpp) &&
1729       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
1730     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
1731       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
1732     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1733   }
1734 
1735   if (!IsVALU && !IsMemOrExport)
1736     return WaitStatesNeeded;
1737 
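  // WAW/WAR hazards: this instruction writes a VGPR that a recent MFMA or DOT
  // also wrote (WAW), or that a recent SMFMA still reads as SrcC (WAR).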
1738   for (const MachineOperand &Def : MI->defs()) {
1739     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
1740     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
1741     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
1742     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
1743     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
1744     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
1745     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
1746     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
1747     const int DotWriteDifferentVALUWrite = 3;
1748     const int MaxWaitStates = 19;
1749     const int MaxWarWaitStates = 15;
1750 
1751     Reg = Def.getReg();
1752 
1753     DOT = nullptr;
1754     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
1755                                                    MaxWaitStates);
1756     if (DOT && DOT->getOpcode() != MI->getOpcode())
1757       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
1758                                                     WaitStatesSinceDef);
1759 
1760     MFMA = nullptr;
1761     WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
1762                                                MaxWaitStates);
1763     if (MFMA) {
1764       int NeedWaitStates = MaxWaitStates;
1765       switch (TSchedModel.computeInstrLatency(MFMA)) {
1766       case 2:
1767         NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
1768         break;
1769       case 4:
1770         assert(isDGEMM(MFMA->getOpcode()));
1771         NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
1772         break;
1773       case 8:
1774         NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
1775         break;
1776       case 16: LLVM_FALLTHROUGH;
1777       default:
1778         NeedWaitStates = isDGEMM(MFMA->getOpcode())
1779                    ? DMFMA16x16WriteVgprVALUWriteWaitStates
1780                    : SMFMA32x32WriteVgprVALUWawWaitStates;
1781         break;
1782       }
1783 
1784       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1785       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1786 
1787       if (WaitStatesNeeded == MaxWaitStates)
1788         break;
1789     }
1790 
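    // Match a prior SMFMA that reads Reg through its src2 (SrcC) operand;
    // this instruction's write must wait until that read has completed.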
1791     auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
1792                              this](const MachineInstr &MI) {
1793       if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
1794           !MI.readsRegister(Reg, &TRI))
1795         return false;
1796 
1797       const MachineOperand *SrcC =
1798           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
1799       assert(SrcC);
1800       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
1801         return false;
1802 
1803       MFMA = &MI;
1804       return true;
1805     };
1806 
1807     MFMA = nullptr;
1808     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
1809                                                 MaxWarWaitStates);
1810     if (!MFMA)
1811       continue;
1812 
1813     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
1814     int NeedWaitStates = MaxWaitStates;
1815     switch (HazardDefLatency) {
1816     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
1817              break;
1818     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
1819              break;
1820     case 16: LLVM_FALLTHROUGH;
1821     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
1822              break;
1823     }
1824 
1825     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
1826     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1827   }
1828 
1829   return WaitStatesNeeded;
1830 }
1831 
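// Advise the scheduler to prefer a different candidate when this MFMA would
// issue within the latency shadow of a preceding MFMA.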
1832 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1833   if (!SU->isInstr())
1834     return false;
1835 
1836   const MachineInstr *MAI = nullptr;
1837   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
1838     MAI = nullptr;
1839     if (SIInstrInfo::isMAI(MI) &&
1840         MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1841         MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
1842       MAI = &MI;
1843     return MAI != nullptr;
1844   };
1845 
1846   MachineInstr *MI = SU->getInstr();
1847   if (IsMFMAFn(*MI)) {
1848     int W = getWaitStatesSince(IsMFMAFn, 16);
1849     if (MAI)
1850       return W < (int)TSchedModel.computeInstrLatency(MAI);
1851   }
1852 
1853   return false;
1854 }
1855