1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "AMDGPUSubtarget.h"
15 #include "SIDefines.h"
16 #include "SIInstrInfo.h"
17 #include "SIRegisterInfo.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/iterator_range.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineInstrBuilder.h"
24 #include "llvm/CodeGen/MachineOperand.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/MC/MCInstrDesc.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include <algorithm>
29 #include <cassert>
30 #include <limits>
31 #include <set>
32 #include <vector>
33 
34 using namespace llvm;
35 
36 //===----------------------------------------------------------------------===//
37 // Hazard Recognizer Implementation
38 //===----------------------------------------------------------------------===//
39 
40 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
41   IsHazardRecognizerMode(false),
42   CurrCycleInstr(nullptr),
43   MF(MF),
44   ST(MF.getSubtarget<GCNSubtarget>()),
45   TII(*ST.getInstrInfo()),
46   TRI(TII.getRegisterInfo()),
47   ClauseUses(TRI.getNumRegUnits()),
48   ClauseDefs(TRI.getNumRegUnits()) {
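  // Look back far enough to cover the deepest hazard windows we check: the
  // MFMA/AGPR hazards handled in checkMAIHazards() can require up to 18 wait
  // states, while the remaining hazards need at most 5.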
49   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
50   TSchedModel.init(&ST);
51 }
52 
53 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
54   EmitInstruction(SU->getInstr());
55 }
56 
57 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
58   CurrCycleInstr = MI;
59 }
60 
61 static bool isDivFMas(unsigned Opcode) {
62   return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
63 }
64 
65 static bool isSGetReg(unsigned Opcode) {
66   return Opcode == AMDGPU::S_GETREG_B32;
67 }
68 
69 static bool isSSetReg(unsigned Opcode) {
70   switch (Opcode) {
71   case AMDGPU::S_SETREG_B32:
72   case AMDGPU::S_SETREG_B32_mode:
73   case AMDGPU::S_SETREG_IMM32_B32:
74   case AMDGPU::S_SETREG_IMM32_B32_mode:
75     return true;
76   }
77   return false;
78 }
79 
80 static bool isRWLane(unsigned Opcode) {
81   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
82 }
83 
84 static bool isRFE(unsigned Opcode) {
85   return Opcode == AMDGPU::S_RFE_B64;
86 }
87 
88 static bool isSMovRel(unsigned Opcode) {
89   switch (Opcode) {
90   case AMDGPU::S_MOVRELS_B32:
91   case AMDGPU::S_MOVRELS_B64:
92   case AMDGPU::S_MOVRELD_B32:
93   case AMDGPU::S_MOVRELD_B64:
94     return true;
95   default:
96     return false;
97   }
98 }
99 
100 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
101                                     const MachineInstr &MI) {
102   if (TII.isAlwaysGDS(MI.getOpcode()))
103     return true;
104 
105   switch (MI.getOpcode()) {
106   case AMDGPU::S_SENDMSG:
107   case AMDGPU::S_SENDMSGHALT:
108   case AMDGPU::S_TTRACEDATA:
109     return true;
110   // These DS opcodes don't support GDS.
111   case AMDGPU::DS_NOP:
112   case AMDGPU::DS_PERMUTE_B32:
113   case AMDGPU::DS_BPERMUTE_B32:
114     return false;
115   default:
116     if (TII.isDS(MI.getOpcode())) {
117       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
118                                            AMDGPU::OpName::gds);
119       if (MI.getOperand(GDS).getImm())
120         return true;
121     }
122     return false;
123   }
124 }
125 
126 static bool isPermlane(const MachineInstr &MI) {
127   unsigned Opcode = MI.getOpcode();
128   return Opcode == AMDGPU::V_PERMLANE16_B32 ||
129          Opcode == AMDGPU::V_PERMLANEX16_B32;
130 }
131 
132 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
133   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
134                                                      AMDGPU::OpName::simm16);
135   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
136 }
137 
138 ScheduleHazardRecognizer::HazardType
139 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
140   MachineInstr *MI = SU->getInstr();
141   // If we are not in "HazardRecognizerMode", and are therefore being run from
142   // the scheduler, track possible stalls from hazards but don't insert noops.
143   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
144 
145   if (MI->isBundle())
146    return NoHazard;
147 
148   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
149     return HazardType;
150 
151   // FIXME: Should flat be considered vmem?
152   if ((SIInstrInfo::isVMEM(*MI) ||
153        SIInstrInfo::isFLAT(*MI))
154       && checkVMEMHazards(MI) > 0)
155     return HazardType;
156 
157   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
158     return HazardType;
159 
160   if (checkFPAtomicToDenormModeHazard(MI) > 0)
161     return HazardType;
162 
163   if (ST.hasNoDataDepHazard())
164     return NoHazard;
165 
166   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
167     return HazardType;
168 
169   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
170     return HazardType;
171 
172   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
173     return HazardType;
174 
175   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
176     return HazardType;
177 
178   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
179     return HazardType;
180 
181   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
182     return HazardType;
183 
184   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
185     return HazardType;
186 
187   if (ST.hasReadM0MovRelInterpHazard() &&
188       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
189       checkReadM0Hazards(MI) > 0)
190     return HazardType;
191 
192   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
193       checkReadM0Hazards(MI) > 0)
194     return HazardType;
195 
196   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
197     return HazardType;
198 
199   if ((SIInstrInfo::isVMEM(*MI) ||
200        SIInstrInfo::isFLAT(*MI) ||
201        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
202     return HazardType;
203 
204   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
205     return HazardType;
206 
207   return NoHazard;
208 }
209 
210 static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
211   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
212       .addImm(0);
213 }
214 
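// Walk the instructions inside the bundle headed by CurrCycleInstr, check
// each one for hazards, insert s_nop instructions into the bundle where wait
// states are required, and record what was emitted in EmittedInstrs.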
215 void GCNHazardRecognizer::processBundle() {
216   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
217   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
218   // Check bundled MachineInstr's for hazards.
219   for (; MI != E && MI->isInsideBundle(); ++MI) {
220     CurrCycleInstr = &*MI;
221     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
222 
223     if (IsHazardRecognizerMode)
224       fixHazards(CurrCycleInstr);
225 
226     for (unsigned i = 0; i < WaitStates; ++i)
227       insertNoopInBundle(CurrCycleInstr, TII);
228 
229     // It's unnecessary to track more than MaxLookAhead instructions. Since we
230     // include the bundled MI directly after, only add a maximum of
231     // (MaxLookAhead - 1) noops to EmittedInstrs.
232     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
233       EmittedInstrs.push_front(nullptr);
234 
235     EmittedInstrs.push_front(CurrCycleInstr);
236     EmittedInstrs.resize(MaxLookAhead);
237   }
238   CurrCycleInstr = nullptr;
239 }
240 
241 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
242   IsHazardRecognizerMode = true;
243   CurrCycleInstr = MI;
244   unsigned W = PreEmitNoopsCommon(MI);
245   fixHazards(MI);
246   CurrCycleInstr = nullptr;
247   return W;
248 }
249 
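// Returns the number of wait states required before \p MI, taking the maximum
// over all hazard checks that apply to it.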
250 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
251   if (MI->isBundle())
252     return 0;
253 
254   int WaitStates = 0;
255 
256   if (SIInstrInfo::isSMRD(*MI))
257     return std::max(WaitStates, checkSMRDHazards(MI));
258 
259   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
260     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
261 
262   if (ST.hasNSAtoVMEMBug())
263     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
264 
265   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
266 
267   if (ST.hasNoDataDepHazard())
268     return WaitStates;
269 
270   if (SIInstrInfo::isVALU(*MI))
271     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
272 
273   if (SIInstrInfo::isDPP(*MI))
274     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
275 
276   if (isDivFMas(MI->getOpcode()))
277     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
278 
279   if (isRWLane(MI->getOpcode()))
280     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
281 
282   if (MI->isInlineAsm())
283     return std::max(WaitStates, checkInlineAsmHazards(MI));
284 
285   if (isSGetReg(MI->getOpcode()))
286     return std::max(WaitStates, checkGetRegHazards(MI));
287 
288   if (isSSetReg(MI->getOpcode()))
289     return std::max(WaitStates, checkSetRegHazards(MI));
290 
291   if (isRFE(MI->getOpcode()))
292     return std::max(WaitStates, checkRFEHazards(MI));
293 
294   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
295                                            isSMovRel(MI->getOpcode())))
296     return std::max(WaitStates, checkReadM0Hazards(MI));
297 
298   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
299     return std::max(WaitStates, checkReadM0Hazards(MI));
300 
301   if (SIInstrInfo::isMAI(*MI))
302     return std::max(WaitStates, checkMAIHazards(MI));
303 
304   if (SIInstrInfo::isVMEM(*MI) ||
305       SIInstrInfo::isFLAT(*MI) ||
306       SIInstrInfo::isDS(*MI))
307     return std::max(WaitStates, checkMAILdStHazards(MI));
308 
309   return WaitStates;
310 }
311 
312 void GCNHazardRecognizer::EmitNoop() {
313   EmittedInstrs.push_front(nullptr);
314 }
315 
316 void GCNHazardRecognizer::AdvanceCycle() {
317   // When the scheduler detects a stall, it will call AdvanceCycle() without
318   // emitting any instructions.
319   if (!CurrCycleInstr) {
320     EmittedInstrs.push_front(nullptr);
321     return;
322   }
323 
324   // Do not track non-instructions which do not affect the wait states.
325   // If included, these instructions can lead to buffer overflow such that
326   // detectable hazards are missed.
327   if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
328       CurrCycleInstr->isKill()) {
329     CurrCycleInstr = nullptr;
330     return;
331   }
332 
333   if (CurrCycleInstr->isBundle()) {
334     processBundle();
335     return;
336   }
337 
338   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
339 
340   // Keep track of emitted instructions
341   EmittedInstrs.push_front(CurrCycleInstr);
342 
343   // Add a nullptr for each additional wait state after the first.  Make sure
344   // not to add more than getMaxLookAhead() items to the list, since we
345   // truncate the list to that size right after this loop.
346   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
347        i < e; ++i) {
348     EmittedInstrs.push_front(nullptr);
349   }
350 
351   // getMaxLookahead() is the largest number of wait states we will ever need
352   // to insert, so there is no point in keeping track of more than that many
353   // wait states.
354   EmittedInstrs.resize(getMaxLookAhead());
355 
356   CurrCycleInstr = nullptr;
357 }
358 
359 void GCNHazardRecognizer::RecedeCycle() {
360   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
361 }
362 
363 //===----------------------------------------------------------------------===//
364 // Helper Functions
365 //===----------------------------------------------------------------------===//
366 
367 typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
368 
369 // Returns the minimum number of wait states since \p I, walking all predecessors.
370 // Scans only until \p IsExpired returns true.
371 // Can only be run in hazard recognizer mode.
372 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
373                               MachineBasicBlock *MBB,
374                               MachineBasicBlock::reverse_instr_iterator I,
375                               int WaitStates,
376                               IsExpiredFn IsExpired,
377                               DenseSet<const MachineBasicBlock *> &Visited) {
378   for (auto E = MBB->instr_rend(); I != E; ++I) {
379     // Don't add WaitStates for parent BUNDLE instructions.
380     if (I->isBundle())
381       continue;
382 
383     if (IsHazard(&*I))
384       return WaitStates;
385 
386     if (I->isInlineAsm() || I->isMetaInstruction())
387       continue;
388 
389     WaitStates += SIInstrInfo::getNumWaitStates(*I);
390 
391     if (IsExpired(&*I, WaitStates))
392       return std::numeric_limits<int>::max();
393   }
394 
395   int MinWaitStates = WaitStates;
396   bool Found = false;
397   for (MachineBasicBlock *Pred : MBB->predecessors()) {
398     if (!Visited.insert(Pred).second)
399       continue;
400 
401     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
402                                WaitStates, IsExpired, Visited);
403 
404     if (W == std::numeric_limits<int>::max())
405       continue;
406 
407     MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
408     if (IsExpired(nullptr, MinWaitStates))
409       return MinWaitStates;
410 
411     Found = true;
412   }
413 
414   if (Found)
415     return MinWaitStates;
416 
417   return std::numeric_limits<int>::max();
418 }
419 
420 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
421                               MachineInstr *MI,
422                               IsExpiredFn IsExpired) {
423   DenseSet<const MachineBasicBlock *> Visited;
424   return getWaitStatesSince(IsHazard, MI->getParent(),
425                             std::next(MI->getReverseIterator()),
426                             0, IsExpired, Visited);
427 }
428 
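// In hazard recognizer mode, walk the machine code backwards from
// CurrCycleInstr (possibly across basic block boundaries); otherwise consult
// the EmittedInstrs queue maintained by AdvanceCycle(). Returns INT_MAX if no
// instruction satisfying \p IsHazard is found within \p Limit wait states.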
429 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
430   if (IsHazardRecognizerMode) {
431     auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
432       return WaitStates >= Limit;
433     };
434     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
435   }
436 
437   int WaitStates = 0;
438   for (MachineInstr *MI : EmittedInstrs) {
439     if (MI) {
440       if (IsHazard(MI))
441         return WaitStates;
442 
443       if (MI->isInlineAsm())
444         continue;
445     }
446     ++WaitStates;
447 
448     if (WaitStates >= Limit)
449       break;
450   }
451   return std::numeric_limits<int>::max();
452 }
453 
454 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
455                                                IsHazardFn IsHazardDef,
456                                                int Limit) {
457   const SIRegisterInfo *TRI = ST.getRegisterInfo();
458 
459   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
460     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
461   };
462 
463   return getWaitStatesSince(IsHazardFn, Limit);
464 }
465 
466 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
467                                                   int Limit) {
468   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
469     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
470   };
471 
472   return getWaitStatesSince(IsHazardFn, Limit);
473 }
474 
475 //===----------------------------------------------------------------------===//
476 // No-op Hazard Detection
477 //===----------------------------------------------------------------------===//
478 
479 static void addRegUnits(const SIRegisterInfo &TRI,
480                         BitVector &BV, unsigned Reg) {
481   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
482     BV.set(*RUI);
483 }
484 
485 static void addRegsToSet(const SIRegisterInfo &TRI,
486                          iterator_range<MachineInstr::const_mop_iterator> Ops,
487                          BitVector &Set) {
488   for (const MachineOperand &Op : Ops) {
489     if (Op.isReg())
490       addRegUnits(TRI, Set, Op.getReg());
491   }
492 }
493 
494 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
495   // XXX: Do we need to worry about implicit operands?
496   addRegsToSet(TRI, MI.defs(), ClauseDefs);
497   addRegsToSet(TRI, MI.uses(), ClauseUses);
498 }
499 
500 static bool breaksSMEMSoftClause(MachineInstr *MI) {
501   return !SIInstrInfo::isSMRD(*MI);
502 }
503 
504 static bool breaksVMEMSoftClause(MachineInstr *MI) {
505   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
506 }
507 
508 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
509   // SMEM soft clauses are only present on VI+, and only matter if xnack is
510   // enabled.
511   if (!ST.isXNACKEnabled())
512     return 0;
513 
514   bool IsSMRD = TII.isSMRD(*MEM);
515 
516   resetClause();
517 
518   // A soft-clause is any group of consecutive SMEM instructions.  The
519   // instructions in this group may return out of order and/or may be
520   // replayed (i.e. the same instruction issued more than once).
521   //
522   // In order to handle these situations correctly we need to make sure that
523   // when a clause has more than one instruction, no instruction in the clause
524   // writes to a register that is read by another instruction in the clause
525   // (including itself). If we encounter this situation, we need to break the
526   // clause by inserting a non-SMEM instruction.
527 
528   for (MachineInstr *MI : EmittedInstrs) {
529     // When we hit a non-SMEM instruction then we have passed the start of the
530     // clause and we can stop.
531     if (!MI)
532       break;
533 
534     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
535       break;
536 
537     addClauseInst(*MI);
538   }
539 
540   if (ClauseDefs.none())
541     return 0;
542 
543   // We need to make sure not to put loads and stores in the same clause if they
544   // use the same address. For now, just start a new clause whenever we see a
545   // store.
546   if (MEM->mayStore())
547     return 1;
548 
549   addClauseInst(*MEM);
550 
551   // If the set of defs and uses intersect then we cannot add this instruction
552   // to the clause, so we have a hazard.
553   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
554 }
555 
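// The check*Hazards routines below share a common pattern: take the number of
// wait states the hazard requires, subtract the wait states that have already
// elapsed since the offending instruction (as reported by the
// getWaitStatesSince* helpers), and return the difference. A result of zero or
// less means nothing further needs to be inserted. Sketch of the pattern
// (names are illustrative):
//
//   int Needed = RequiredWaitStates -
//                getWaitStatesSinceDef(Reg, IsHazardDefFn, RequiredWaitStates);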
556 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
557   int WaitStatesNeeded = 0;
558 
559   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
560 
561   // This SMRD hazard only affects SI.
562   if (!ST.hasSMRDReadVALUDefHazard())
563     return WaitStatesNeeded;
564 
565   // A read of an SGPR by an SMRD instruction requires 4 wait states when the
566   // SGPR was written by a VALU instruction.
567   int SmrdSgprWaitStates = 4;
568   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
569   auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
570 
571   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
572 
573   for (const MachineOperand &Use : SMRD->uses()) {
574     if (!Use.isReg())
575       continue;
576     int WaitStatesNeededForUse =
577         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
578                                                    SmrdSgprWaitStates);
579     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
580 
581     // This fixes what appears to be undocumented hardware behavior in SI where
582     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
583     // needs some number of nops in between. We don't know how many we need, but
584     // let's use 4. This wasn't discovered before probably because the only
585     // case when this happens is when we expand a 64-bit pointer into a full
586     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
587     // probably never encountered in the closed-source land.
588     if (IsBufferSMRD) {
589       int WaitStatesNeededForUse =
590         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
591                                                    IsBufferHazardDefFn,
592                                                    SmrdSgprWaitStates);
593       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
594     }
595   }
596 
597   return WaitStatesNeeded;
598 }
599 
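// Illustrative sequence for this hazard (sketch; registers are arbitrary):
//
//   v_readfirstlane_b32 s4, v0              ; VALU writes an SGPR
//   buffer_load_dword v1, off, s[4:7], 0    ; VMEM reads that SGPR
//
// On affected subtargets the load needs up to 5 wait states after the VALU
// write.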
600 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
601   if (!ST.hasVMEMReadSGPRVALUDefHazard())
602     return 0;
603 
604   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
605 
606   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
607   // SGPR was written by a VALU instruction.
608   const int VmemSgprWaitStates = 5;
609   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
610   for (const MachineOperand &Use : VMEM->uses()) {
611     if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
612       continue;
613 
614     int WaitStatesNeededForUse =
615         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
616                                                    VmemSgprWaitStates);
617     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
618   }
619   return WaitStatesNeeded;
620 }
621 
622 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
623   const SIRegisterInfo *TRI = ST.getRegisterInfo();
624   const SIInstrInfo *TII = ST.getInstrInfo();
625 
626   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
627   int DppVgprWaitStates = 2;
628   int DppExecWaitStates = 5;
629   int WaitStatesNeeded = 0;
630   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
631 
632   for (const MachineOperand &Use : DPP->uses()) {
633     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
634       continue;
635     int WaitStatesNeededForUse =
636         DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
637                               [](MachineInstr *) { return true; },
638                               DppVgprWaitStates);
639     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
640   }
641 
642   WaitStatesNeeded = std::max(
643       WaitStatesNeeded,
644       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
645                                                 DppExecWaitStates));
646 
647   return WaitStatesNeeded;
648 }
649 
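// Illustrative sequence for this hazard (sketch; registers are arbitrary):
//
//   v_cmp_eq_f32 vcc, v0, v1          ; VALU writes VCC
//   v_div_fmas_f32 v2, v3, v4, v5     ; reads VCC implicitly
//
// The v_div_fmas needs 4 wait states after the VCC write.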
650 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
651   const SIInstrInfo *TII = ST.getInstrInfo();
652 
653   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
654   // instruction.
655   const int DivFMasWaitStates = 4;
656   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
657   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
658                                                DivFMasWaitStates);
659 
660   return DivFMasWaitStates - WaitStatesNeeded;
661 }
662 
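// Illustrative sequence for this hazard (sketch):
//
//   s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
//   s_getreg_b32 s1, hwreg(HW_REG_MODE, 0, 4)   ; same hardware register
//
// The s_getreg needs 2 wait states after the s_setreg of the same register.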
663 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
664   const SIInstrInfo *TII = ST.getInstrInfo();
665   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
666 
667   const int GetRegWaitStates = 2;
668   auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
669     return GetRegHWReg == getHWReg(TII, *MI);
670   };
671   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
672 
673   return GetRegWaitStates - WaitStatesNeeded;
674 }
675 
676 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
677   const SIInstrInfo *TII = ST.getInstrInfo();
678   unsigned HWReg = getHWReg(TII, *SetRegInstr);
679 
680   const int SetRegWaitStates = ST.getSetRegWaitStates();
681   auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
682     return HWReg == getHWReg(TII, *MI);
683   };
684   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
685   return SetRegWaitStates - WaitStatesNeeded;
686 }
687 
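// If \p MI is a VMEM store that is exposed to the "12 dword store" hazard,
// i.e. its store data is wider than 64 bits and could be overwritten by the
// next instruction before the store has consumed it, return the index of its
// store-data operand; otherwise return -1.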
688 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
689   if (!MI.mayStore())
690     return -1;
691 
692   const SIInstrInfo *TII = ST.getInstrInfo();
693   unsigned Opcode = MI.getOpcode();
694   const MCInstrDesc &Desc = MI.getDesc();
695 
696   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
697   int VDataRCID = -1;
698   if (VDataIdx != -1)
699     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
700 
701   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
702     // There is no hazard if the instruction does not use vector regs
703     // (like wbinvl1)
704     if (VDataIdx == -1)
705       return -1;
706     // For MUBUF/MTBUF instructions this hazard only exists if the
707     // instruction is not using a register in the soffset field.
708     const MachineOperand *SOffset =
709         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
710     // If we have no soffset operand, then assume this field has been
711     // hardcoded to zero.
712     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
713         (!SOffset || !SOffset->isReg()))
714       return VDataIdx;
715   }
716 
717   // MIMG instructions create a hazard if they don't use a 256-bit T# and
718   // the store size is greater than 8 bytes and they have more than two bits
719   // of their dmask set.
720   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
721   if (TII->isMIMG(MI)) {
722     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
723     assert(SRsrcIdx != -1 &&
724            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
725     (void)SRsrcIdx;
726   }
727 
728   if (TII->isFLAT(MI)) {
729     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
730     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
731       return DataIdx;
732   }
733 
734   return -1;
735 }
736 
737 int
738 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
739                                             const MachineRegisterInfo &MRI) {
740   // Helper to check for the hazard where VMEM instructions that store more than
741   // 8 bytes can have their store data overwritten by the next instruction.
742   const SIRegisterInfo *TRI = ST.getRegisterInfo();
743 
744   const int VALUWaitStates = 1;
745   int WaitStatesNeeded = 0;
746 
747   if (!TRI->isVGPR(MRI, Def.getReg()))
748     return WaitStatesNeeded;
749   Register Reg = Def.getReg();
750   auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
751     int DataIdx = createsVALUHazard(*MI);
752     return DataIdx >= 0 &&
753     TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
754   };
755   int WaitStatesNeededForDef =
756     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
757   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
758 
759   return WaitStatesNeeded;
760 }
761 
762 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
763   // This checks for the hazard where VMEM instructions that store more than
764   // 8 bytes can have their store data overwritten by the next instruction.
765   if (!ST.has12DWordStoreHazard())
766     return 0;
767 
768   const MachineRegisterInfo &MRI = MF.getRegInfo();
769   int WaitStatesNeeded = 0;
770 
771   for (const MachineOperand &Def : VALU->defs()) {
772     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
773   }
774 
775   return WaitStatesNeeded;
776 }
777 
778 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
779   // This checks for hazards associated with inline asm statements.
780   // Since inline asms can contain just about anything, we use this
781   // to call/leverage other check*Hazard routines. Note that
782   // this function doesn't attempt to address all possible inline asm
783   // hazards (good luck), but is a collection of what has been
784   // problematic thus far.
785 
786   // see checkVALUHazards()
787   if (!ST.has12DWordStoreHazard())
788     return 0;
789 
790   const MachineRegisterInfo &MRI = MF.getRegInfo();
791   int WaitStatesNeeded = 0;
792 
793   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
794        I != E; ++I) {
795     const MachineOperand &Op = IA->getOperand(I);
796     if (Op.isReg() && Op.isDef()) {
797       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
798     }
799   }
800 
801   return WaitStatesNeeded;
802 }
803 
804 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
805   const SIInstrInfo *TII = ST.getInstrInfo();
806   const SIRegisterInfo *TRI = ST.getRegisterInfo();
807   const MachineRegisterInfo &MRI = MF.getRegInfo();
808 
809   const MachineOperand *LaneSelectOp =
810       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
811 
812   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
813     return 0;
814 
815   Register LaneSelectReg = LaneSelectOp->getReg();
816   auto IsHazardFn = [TII] (MachineInstr *MI) {
817     return TII->isVALU(*MI);
818   };
819 
820   const int RWLaneWaitStates = 4;
821   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
822                                               RWLaneWaitStates);
823   return RWLaneWaitStates - WaitStatesSince;
824 }
825 
826 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
827   if (!ST.hasRFEHazards())
828     return 0;
829 
830   const SIInstrInfo *TII = ST.getInstrInfo();
831 
832   const int RFEWaitStates = 1;
833 
834   auto IsHazardFn = [TII] (MachineInstr *MI) {
835     return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
836   };
837   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
838   return RFEWaitStates - WaitStatesNeeded;
839 }
840 
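// Illustrative sequence for this hazard (sketch; registers are arbitrary):
//
//   s_mov_b32 m0, s0         ; SALU writes M0
//   s_movrels_b32 s1, s2     ; reads M0 implicitly
//
// On affected subtargets the M0 reader needs 1 wait state after the SALU
// write of M0.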
841 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
842   const SIInstrInfo *TII = ST.getInstrInfo();
843   const int SMovRelWaitStates = 1;
844   auto IsHazardFn = [TII] (MachineInstr *MI) {
845     return TII->isSALU(*MI);
846   };
847   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
848                                                    SMovRelWaitStates);
849 }
850 
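// Driver for the hazard fixups that mitigate hazards by inserting
// instructions rather than plain wait states. Called only when running in
// hazard recognizer mode (from PreEmitNoops and processBundle); each fix*
// routine checks for one subtarget-specific hazard and rewrites the code to
// break it.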
851 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
852   fixVMEMtoScalarWriteHazards(MI);
853   fixVcmpxPermlaneHazards(MI);
854   fixSMEMtoVectorWriteHazards(MI);
855   fixVcmpxExecWARHazard(MI);
856   fixLdsBranchVmemWARHazard(MI);
857 }
858 
859 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
860   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
861     return false;
862 
863   const SIInstrInfo *TII = ST.getInstrInfo();
864   auto IsHazardFn = [TII] (MachineInstr *MI) {
865     return TII->isVOPC(*MI);
866   };
867 
868   auto IsExpiredFn = [] (MachineInstr *MI, int) {
869     if (!MI)
870       return false;
871     unsigned Opc = MI->getOpcode();
872     return SIInstrInfo::isVALU(*MI) &&
873            Opc != AMDGPU::V_NOP_e32 &&
874            Opc != AMDGPU::V_NOP_e64 &&
875            Opc != AMDGPU::V_NOP_sdwa;
876   };
877 
878   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
879       std::numeric_limits<int>::max())
880     return false;
881 
882   // V_NOP will be discarded by SQ.
883   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
884   // which is always a VGPR and available.
885   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
886   Register Reg = Src0->getReg();
887   bool IsUndef = Src0->isUndef();
888   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
889           TII->get(AMDGPU::V_MOV_B32_e32))
890     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
891     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
892 
893   return true;
894 }
895 
896 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
897   if (!ST.hasVMEMtoScalarWriteHazard())
898     return false;
899 
900   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
901     return false;
902 
903   if (MI->getNumDefs() == 0)
904     return false;
905 
906   const SIRegisterInfo *TRI = ST.getRegisterInfo();
907 
908   auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
909     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
910         !SIInstrInfo::isFLAT(*I))
911       return false;
912 
913     for (const MachineOperand &Def : MI->defs()) {
914       MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
915       if (!Op)
916         continue;
917       return true;
918     }
919     return false;
920   };
921 
922   auto IsExpiredFn = [](MachineInstr *MI, int) {
923     return MI && (SIInstrInfo::isVALU(*MI) ||
924                   (MI->getOpcode() == AMDGPU::S_WAITCNT &&
925                    !MI->getOperand(0).getImm()) ||
926                   (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
927                    MI->getOperand(0).getImm() == 0xffe3));
928   };
929 
930   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
931       std::numeric_limits<int>::max())
932     return false;
933 
934   const SIInstrInfo *TII = ST.getInstrInfo();
935   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
936           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
937       .addImm(0xffe3);
938   return true;
939 }
940 
941 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
942   if (!ST.hasSMEMtoVectorWriteHazard())
943     return false;
944 
945   if (!SIInstrInfo::isVALU(*MI))
946     return false;
947 
948   unsigned SDSTName;
949   switch (MI->getOpcode()) {
950   case AMDGPU::V_READLANE_B32:
951   case AMDGPU::V_READLANE_B32_gfx10:
952   case AMDGPU::V_READFIRSTLANE_B32:
953     SDSTName = AMDGPU::OpName::vdst;
954     break;
955   default:
956     SDSTName = AMDGPU::OpName::sdst;
957     break;
958   }
959 
960   const SIInstrInfo *TII = ST.getInstrInfo();
961   const SIRegisterInfo *TRI = ST.getRegisterInfo();
962   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
963   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
964   if (!SDST) {
965     for (const auto &MO : MI->implicit_operands()) {
966       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
967         SDST = &MO;
968         break;
969       }
970     }
971   }
972 
973   if (!SDST)
974     return false;
975 
976   const Register SDSTReg = SDST->getReg();
977   auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
978     return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
979   };
980 
981   auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
982     if (MI) {
983       if (TII->isSALU(*MI)) {
984         switch (MI->getOpcode()) {
985         case AMDGPU::S_SETVSKIP:
986         case AMDGPU::S_VERSION:
987         case AMDGPU::S_WAITCNT_VSCNT:
988         case AMDGPU::S_WAITCNT_VMCNT:
989         case AMDGPU::S_WAITCNT_EXPCNT:
990           // These instructions cannot mitigate the hazard.
991           return false;
992         case AMDGPU::S_WAITCNT_LGKMCNT:
993           // Reducing lgkmcnt count to 0 always mitigates the hazard.
994           return (MI->getOperand(1).getImm() == 0) &&
995                  (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
996         case AMDGPU::S_WAITCNT: {
997           const int64_t Imm = MI->getOperand(0).getImm();
998           AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
999           return (Decoded.LgkmCnt == 0);
1000         }
1001         default:
1002           // SOPP instructions cannot mitigate the hazard.
1003           if (TII->isSOPP(*MI))
1004             return false;
1005           // At this point the SALU can be assumed to mitigate the hazard
1006           // because either:
1007           // (a) it is independent of the at risk SMEM (breaking chain),
1008           // or
1009           // (b) it is dependent on the SMEM, in which case an appropriate
1010           //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1011           //     SMEM instruction.
1012           return true;
1013         }
1014       }
1015     }
1016     return false;
1017   };
1018 
1019   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1020       std::numeric_limits<int>::max())
1021     return false;
1022 
1023   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1024           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1025       .addImm(0);
1026   return true;
1027 }
1028 
1029 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1030   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1031     return false;
1032 
1033   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1034   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1035     return false;
1036 
1037   auto IsHazardFn = [TRI] (MachineInstr *I) {
1038     if (SIInstrInfo::isVALU(*I))
1039       return false;
1040     return I->readsRegister(AMDGPU::EXEC, TRI);
1041   };
1042 
1043   const SIInstrInfo *TII = ST.getInstrInfo();
1044   auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1045     if (!MI)
1046       return false;
1047     if (SIInstrInfo::isVALU(*MI)) {
1048       if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1049         return true;
1050       for (auto MO : MI->implicit_operands())
1051         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1052           return true;
1053     }
1054     if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1055         (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1056       return true;
1057     return false;
1058   };
1059 
1060   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1061       std::numeric_limits<int>::max())
1062     return false;
1063 
1064   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1065           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1066     .addImm(0xfffe);
1067   return true;
1068 }
1069 
1070 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1071   if (!ST.hasLdsBranchVmemWARHazard())
1072     return false;
1073 
1074   auto IsHazardInst = [] (const MachineInstr *MI) {
1075     if (SIInstrInfo::isDS(*MI))
1076       return 1;
1077     if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1078       return 2;
1079     return 0;
1080   };
1081 
1082   auto InstType = IsHazardInst(MI);
1083   if (!InstType)
1084     return false;
1085 
1086   auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1087     return I && (IsHazardInst(I) ||
1088                  (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1089                   I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1090                   !I->getOperand(1).getImm()));
1091   };
1092 
1093   auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1094     if (!I->isBranch())
1095       return false;
1096 
1097     auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1098       auto InstType2 = IsHazardInst(I);
1099       return InstType2 && InstType != InstType2;
1100     };
1101 
1102     auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1103       if (!I)
1104         return false;
1105 
1106       auto InstType2 = IsHazardInst(I);
1107       if (InstType == InstType2)
1108         return true;
1109 
1110       return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1111              I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1112              !I->getOperand(1).getImm();
1113     };
1114 
1115     return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1116            std::numeric_limits<int>::max();
1117   };
1118 
1119   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1120       std::numeric_limits<int>::max())
1121     return false;
1122 
1123   const SIInstrInfo *TII = ST.getInstrInfo();
1124   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1125           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1126     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1127     .addImm(0);
1128 
1129   return true;
1130 }
1131 
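// On subtargets with the NSA-to-VMEM bug, an NSA-encoded MIMG instruction
// (16 bytes or larger) followed by an MUBUF/MTBUF whose immediate offset has
// bit 1 or bit 2 set requires a wait state in between.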
1132 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1133   int NSAtoVMEMWaitStates = 1;
1134 
1135   if (!ST.hasNSAtoVMEMBug())
1136     return 0;
1137 
1138   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1139     return 0;
1140 
1141   const SIInstrInfo *TII = ST.getInstrInfo();
1142   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1143   if (!Offset || (Offset->getImm() & 6) == 0)
1144     return 0;
1145 
1146   auto IsHazardFn = [TII] (MachineInstr *I) {
1147     if (!SIInstrInfo::isMIMG(*I))
1148       return false;
1149     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1150     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1151            TII->getInstSizeInBytes(*I) >= 16;
1152   };
1153 
1154   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1155 }
1156 
1157 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1158   int FPAtomicToDenormModeWaitStates = 3;
1159 
1160   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1161     return 0;
1162 
1163   auto IsHazardFn = [] (MachineInstr *I) {
1164     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1165       return false;
1166     return SIInstrInfo::isFPAtomic(*I);
1167   };
1168 
1169   auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1170     if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
1171       return true;
1172 
1173     switch (MI->getOpcode()) {
1174     case AMDGPU::S_WAITCNT:
1175     case AMDGPU::S_WAITCNT_VSCNT:
1176     case AMDGPU::S_WAITCNT_VMCNT:
1177     case AMDGPU::S_WAITCNT_EXPCNT:
1178     case AMDGPU::S_WAITCNT_LGKMCNT:
1179     case AMDGPU::S_WAITCNT_IDLE:
1180       return true;
1181     default:
1182       break;
1183     }
1184 
1185     return false;
1186   };
1187 
1188 
1189   return FPAtomicToDenormModeWaitStates -
1190          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1191 }
1192 
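// Checks the wait states required around MFMA and v_accvgpr_read/write
// instructions: VALU writes of EXEC or of VGPR sources shortly before an
// MFMA, MFMA results overlapping the AGPR operands of a following MFMA or
// accvgpr read/write, v_accvgpr_write results feeding a following consumer,
// and MFMA src2 reads overlapping a following v_accvgpr_write destination.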
1193 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1194   assert(SIInstrInfo::isMAI(*MI));
1195 
1196   int WaitStatesNeeded = 0;
1197   unsigned Opc = MI->getOpcode();
1198 
1199   auto IsVALUFn = [] (MachineInstr *MI) {
1200     return SIInstrInfo::isVALU(*MI);
1201   };
1202 
1203   if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
1204     const int LegacyVALUWritesVGPRWaitStates = 2;
1205     const int VALUWritesExecWaitStates = 4;
1206     const int MaxWaitStates = 4;
1207 
1208     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1209       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1210     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1211 
1212     if (WaitStatesNeeded < MaxWaitStates) {
1213       for (const MachineOperand &Use : MI->explicit_uses()) {
1214         const int MaxWaitStates = 2;
1215 
1216         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1217           continue;
1218 
1219         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1220           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1221         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1222 
1223         if (WaitStatesNeeded == MaxWaitStates)
1224           break;
1225       }
1226     }
1227   }
1228 
1229   auto IsMFMAFn = [] (MachineInstr *MI) {
1230     return SIInstrInfo::isMAI(*MI) &&
1231            MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1232            MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
1233   };
1234 
1235   for (const MachineOperand &Op : MI->explicit_operands()) {
1236     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1237       continue;
1238 
1239     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
1240       continue;
1241 
1242     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1243     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1244     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1245     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1246     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1247     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1248     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1249     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1250     const int MaxWaitStates = 18;
1251     Register Reg = Op.getReg();
1252     unsigned HazardDefLatency = 0;
1253 
1254     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1255                               (MachineInstr *MI) {
1256       if (!IsMFMAFn(MI))
1257         return false;
1258       Register DstReg = MI->getOperand(0).getReg();
1259       if (DstReg == Reg)
1260         return false;
1261       HazardDefLatency = std::max(HazardDefLatency,
1262                                   TSchedModel.computeInstrLatency(MI));
1263       return TRI.regsOverlap(DstReg, Reg);
1264     };
1265 
1266     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1267                                                    MaxWaitStates);
1268     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1269     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1270     int OpNo = MI->getOperandNo(&Op);
1271     if (OpNo == SrcCIdx) {
1272       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1273     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
1274       switch (HazardDefLatency) {
1275       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1276                break;
1277       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1278                break;
1279       case 16: LLVM_FALLTHROUGH;
1280       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1281                break;
1282       }
1283     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1284       switch (HazardDefLatency) {
1285       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1286                break;
1287       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1288                break;
1289       case 16: LLVM_FALLTHROUGH;
1290       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1291                break;
1292       }
1293     }
1294 
1295     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1296     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1297 
1298     if (WaitStatesNeeded == MaxWaitStates)
1299       return WaitStatesNeeded; // Early exit.
1300 
1301     auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1302       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1303         return false;
1304       Register DstReg = MI->getOperand(0).getReg();
1305       return TRI.regsOverlap(Reg, DstReg);
1306     };
1307 
1308     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1309     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1310     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1311     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1312     if (OpNo == SrcCIdx)
1313       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1314     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
1315       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1316 
1317     WaitStatesNeededForUse = NeedWaitStates -
1318       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1319     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1320 
1321     if (WaitStatesNeeded == MaxWaitStates)
1322       return WaitStatesNeeded; // Early exit.
1323   }
1324 
1325   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1326     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1327     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1328     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1329     const int MaxWaitStates = 13;
1330     Register DstReg = MI->getOperand(0).getReg();
1331     unsigned HazardDefLatency = 0;
1332 
1333     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1334                          (MachineInstr *MI) {
1335       if (!IsMFMAFn(MI))
1336         return false;
1337       Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1338       HazardDefLatency = std::max(HazardDefLatency,
1339                                   TSchedModel.computeInstrLatency(MI));
1340       return TRI.regsOverlap(Reg, DstReg);
1341     };
1342 
1343     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1344     int NeedWaitStates;
1345     switch (HazardDefLatency) {
1346     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1347              break;
1348     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1349              break;
1350     case 16: LLVM_FALLTHROUGH;
1351     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1352              break;
1353     }
1354 
1355     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1356     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1357   }
1358 
1359   return WaitStatesNeeded;
1360 }
1361 
1362 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1363   if (!ST.hasMAIInsts())
1364     return 0;
1365 
1366   int WaitStatesNeeded = 0;
1367 
1368   auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1369     return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
1370   };
1371 
1372   for (const MachineOperand &Op : MI->explicit_uses()) {
1373     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1374       continue;
1375 
1376     Register Reg = Op.getReg();
1377 
1378     const int AccVgprReadLdStWaitStates = 2;
1379     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
1380     const int MaxWaitStates = 2;
1381 
1382     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1383       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1384     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1385 
1386     if (WaitStatesNeeded == MaxWaitStates)
1387       return WaitStatesNeeded; // Early exit.
1388 
1389     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
1390       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32 &&
1391           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1392         return false;
1393       auto IsVALUFn = [] (MachineInstr *MI) {
1394         return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1395       };
1396       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1397              std::numeric_limits<int>::max();
1398     };
1399 
1400     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1401       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
1402     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1403   }
1404 
1405   return WaitStatesNeeded;
1406 }
1407 
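// Advise the scheduler to pick a different candidate if \p SU is an MFMA that
// would issue within the latency window of a previously issued MFMA.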
1408 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1409   if (!SU->isInstr())
1410     return false;
1411 
1412   MachineInstr *MAI = nullptr;
1413   auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
1414     MAI = nullptr;
1415     if (SIInstrInfo::isMAI(*MI) &&
1416         MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1417         MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1418       MAI = MI;
1419     return MAI != nullptr;
1420   };
1421 
1422   MachineInstr *MI = SU->getInstr();
1423   if (IsMFMAFn(MI)) {
1424     int W = getWaitStatesSince(IsMFMAFn, 16);
1425     if (MAI)
1426       return W < (int)TSchedModel.computeInstrLatency(MAI);
1427   }
1428 
1429   return false;
1430 }
1431