1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Implements the AMDGPU-specific subclass of TargetSubtargetInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPUInstructionSelector.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "R600Subtarget.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #undef AMDGPUSubtarget
42 
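// The following cl::opts are developer/testing overrides rather than
// user-facing options. For illustration (assumed invocation), they can be
// passed to llc, e.g.
//   llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-vgpr-index-mode ...
// The exact command line depends on the build and test setup.
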
43 static cl::opt<bool> DisablePowerSched(
44   "amdgpu-disable-power-sched",
45   cl::desc("Disable scheduling to minimize mAI power bursts"),
46   cl::init(false));
47 
48 static cl::opt<bool> EnableVGPRIndexMode(
49   "amdgpu-vgpr-index-mode",
50   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51   cl::init(false));
52 
53 static cl::opt<bool> EnableFlatScratch(
54   "amdgpu-enable-flat-scratch",
55   cl::desc("Use flat scratch instructions"),
56   cl::init(false));
57 
58 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
59                            cl::desc("Enable the use of AA during codegen."),
60                            cl::init(true));
61 
62 GCNSubtarget::~GCNSubtarget() = default;
63 
64 GCNSubtarget &
65 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
66                                               StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
75 
76   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
77 
  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal
  // by default.
79   if (isAmdHsaOS())
80     FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
81 
82   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
83 
84   // Disable mutually exclusive bits.
85   if (FS.contains_insensitive("+wavefrontsize")) {
86     if (!FS.contains_insensitive("wavefrontsize16"))
87       FullFS += "-wavefrontsize16,";
88     if (!FS.contains_insensitive("wavefrontsize32"))
89       FullFS += "-wavefrontsize32,";
90     if (!FS.contains_insensitive("wavefrontsize64"))
91       FullFS += "-wavefrontsize64,";
92   }
93 
94   FullFS += FS;
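  // For illustration, on an AMDHSA target the assembled feature string at
  // this point looks roughly like
  //   "+promote-alloca,+load-store-opt,+enable-ds128,+flat-for-global,
  //    +unaligned-access-mode,+trap-handler,+enable-prt-strict-null,<FS>"
  // with the user-provided FS appended last so it can override the defaults.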
95 
96   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
97 
  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing. Other OSes
  // default to the first amdgcn target.
102   if (Gen == AMDGPUSubtarget::INVALID) {
103      Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
104                                         : AMDGPUSubtarget::SOUTHERN_ISLANDS;
105   }
106 
  // We don't support FP64 for EG/NI at the moment.
108   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
109 
  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
113   assert(hasAddr64() || hasFlat());
  // Unless flat-for-global is explicitly enabled or disabled, turn on
  // FlatForGlobal for targets that do not support ADDR64 variants of MUBUF
  // instructions. Such targets cannot use a 64-bit offset with a MUBUF
  // instruction to access the global address space.
118   if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
119     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
120     FlatForGlobal = true;
121   }
  // Unless flat-for-global is explicitly enabled or disabled, use MUBUF
  // instructions for global address space access if flat operations are not
  // available.
124   if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
125     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
126     FlatForGlobal = false;
127   }
128 
129   // Set defaults if needed.
130   if (MaxPrivateElementSize == 0)
131     MaxPrivateElementSize = 4;
132 
133   if (LDSBankCount == 0)
134     LDSBankCount = 32;
135 
136   if (TT.getArch() == Triple::amdgcn) {
137     if (LocalMemorySize == 0)
138       LocalMemorySize = 32768;
139 
    // Do something sensible for an unspecified target.
141     if (!HasMovrel && !HasVGPRIndexMode)
142       HasMovrel = true;
143   }
144 
145   // Don't crash on invalid devices.
146   if (WavefrontSizeLog2 == 0)
147     WavefrontSizeLog2 = 5;
148 
149   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
150   HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
151 
152   TargetID.setTargetIDFromFeaturesString(FS);
153 
154   LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
155                     << TargetID.getXnackSetting() << '\n');
156   LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
157                     << TargetID.getSramEccSetting() << '\n');
158 
159   return *this;
160 }
161 
162 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
163   TargetTriple(TT),
164   GCN3Encoding(false),
165   Has16BitInsts(false),
166   HasMadMixInsts(false),
167   HasMadMacF32Insts(false),
168   HasDsSrc2Insts(false),
169   HasSDWA(false),
170   HasVOP3PInsts(false),
171   HasMulI24(true),
172   HasMulU24(true),
173   HasSMulHi(false),
174   HasInv2PiInlineImm(false),
175   HasFminFmaxLegacy(true),
176   EnablePromoteAlloca(false),
177   HasTrigReducedRange(false),
178   MaxWavesPerEU(10),
179   LocalMemorySize(0),
180   WavefrontSizeLog2(0)
181   { }
182 
183 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
184                            const GCNTargetMachine &TM)
185     : // clang-format off
186     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
187     AMDGPUSubtarget(TT),
188     TargetTriple(TT),
189     TargetID(*this),
190     Gen(INVALID),
191     InstrItins(getInstrItineraryForCPU(GPU)),
192     LDSBankCount(0),
193     MaxPrivateElementSize(0),
194 
195     FastFMAF32(false),
196     FastDenormalF32(false),
197     HalfRate64Ops(false),
198     FullRate64Ops(false),
199 
200     FlatForGlobal(false),
201     AutoWaitcntBeforeBarrier(false),
202     UnalignedScratchAccess(false),
203     UnalignedAccessMode(false),
204 
205     HasApertureRegs(false),
206     SupportsXNACK(false),
207     EnableXNACK(false),
208     EnableTgSplit(false),
209     EnableCuMode(false),
210     TrapHandler(false),
211 
212     EnableLoadStoreOpt(false),
213     EnableUnsafeDSOffsetFolding(false),
214     EnableSIScheduler(false),
215     EnableDS128(false),
216     EnablePRTStrictNull(false),
217     DumpCode(false),
218 
219     FP64(false),
220     CIInsts(false),
221     GFX8Insts(false),
222     GFX9Insts(false),
223     GFX90AInsts(false),
224     GFX10Insts(false),
225     GFX10_3Insts(false),
226     GFX7GFX8GFX9Insts(false),
227     SGPRInitBug(false),
228     NegativeScratchOffsetBug(false),
229     NegativeUnalignedScratchOffsetBug(false),
230     HasSMemRealTime(false),
231     HasIntClamp(false),
232     HasFmaMixInsts(false),
233     HasMovrel(false),
234     HasVGPRIndexMode(false),
235     HasScalarStores(false),
236     HasScalarAtomics(false),
237     HasSDWAOmod(false),
238     HasSDWAScalar(false),
239     HasSDWASdst(false),
240     HasSDWAMac(false),
241     HasSDWAOutModsVOPC(false),
242     HasDPP(false),
243     HasDPP8(false),
244     Has64BitDPP(false),
245     HasPackedFP32Ops(false),
246     HasExtendedImageInsts(false),
247     HasR128A16(false),
248     HasGFX10A16(false),
249     HasG16(false),
250     HasNSAEncoding(false),
251     NSAMaxSize(0),
252     GFX10_AEncoding(false),
253     GFX10_BEncoding(false),
254     HasDLInsts(false),
255     HasDot1Insts(false),
256     HasDot2Insts(false),
257     HasDot3Insts(false),
258     HasDot4Insts(false),
259     HasDot5Insts(false),
260     HasDot6Insts(false),
261     HasDot7Insts(false),
262     HasMAIInsts(false),
263     HasPkFmacF16Inst(false),
264     HasAtomicFaddInsts(false),
265     SupportsSRAMECC(false),
266     EnableSRAMECC(false),
267     HasNoSdstCMPX(false),
268     HasVscnt(false),
269     HasGetWaveIdInst(false),
270     HasSMemTimeInst(false),
271     HasShaderCyclesRegister(false),
272     HasRegisterBanking(false),
273     HasVOP3Literal(false),
274     HasNoDataDepHazard(false),
275     FlatAddressSpace(false),
276     FlatInstOffsets(false),
277     FlatGlobalInsts(false),
278     FlatScratchInsts(false),
279     ScalarFlatScratchInsts(false),
280     HasArchitectedFlatScratch(false),
281     AddNoCarryInsts(false),
282     HasUnpackedD16VMem(false),
283     LDSMisalignedBug(false),
284     HasMFMAInlineLiteralBug(false),
285     UnalignedBufferAccess(false),
286     UnalignedDSAccess(false),
287     HasPackedTID(false),
288 
289     ScalarizeGlobal(false),
290 
291     HasVcmpxPermlaneHazard(false),
292     HasVMEMtoScalarWriteHazard(false),
293     HasSMEMtoVectorWriteHazard(false),
294     HasInstFwdPrefetchBug(false),
295     HasVcmpxExecWARHazard(false),
296     HasLdsBranchVmemWARHazard(false),
297     HasNSAtoVMEMBug(false),
298     HasNSAClauseBug(false),
299     HasOffset3fBug(false),
300     HasFlatSegmentOffsetBug(false),
301     HasImageStoreD16Bug(false),
302     HasImageGather4D16Bug(false),
303 
304     FeatureDisable(false),
305     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
306     TLInfo(TM, *this),
307     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
308   // clang-format on
309   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
310   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
311   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
312   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
313   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
316 }
317 
318 bool GCNSubtarget::enableFlatScratch() const {
319   return flatScratchIsArchitected() ||
320          (EnableFlatScratch && hasFlatScratchInsts());
321 }
322 
323 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
324   if (getGeneration() < GFX10)
325     return 1;
326 
327   switch (Opcode) {
328   case AMDGPU::V_LSHLREV_B64_e64:
329   case AMDGPU::V_LSHLREV_B64_gfx10:
330   case AMDGPU::V_LSHL_B64_e64:
331   case AMDGPU::V_LSHRREV_B64_e64:
332   case AMDGPU::V_LSHRREV_B64_gfx10:
333   case AMDGPU::V_LSHR_B64_e64:
334   case AMDGPU::V_ASHRREV_I64_e64:
335   case AMDGPU::V_ASHRREV_I64_gfx10:
336   case AMDGPU::V_ASHR_I64_e64:
337     return 1;
338   }
339 
340   return 2;
341 }
342 
343 /// This list was mostly derived from experimentation.
344 bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
345   switch (Opcode) {
346   case AMDGPU::V_CVT_F16_F32_e32:
347   case AMDGPU::V_CVT_F16_F32_e64:
348   case AMDGPU::V_CVT_F16_U16_e32:
349   case AMDGPU::V_CVT_F16_U16_e64:
350   case AMDGPU::V_CVT_F16_I16_e32:
351   case AMDGPU::V_CVT_F16_I16_e64:
352   case AMDGPU::V_RCP_F16_e64:
353   case AMDGPU::V_RCP_F16_e32:
354   case AMDGPU::V_RSQ_F16_e64:
355   case AMDGPU::V_RSQ_F16_e32:
356   case AMDGPU::V_SQRT_F16_e64:
357   case AMDGPU::V_SQRT_F16_e32:
358   case AMDGPU::V_LOG_F16_e64:
359   case AMDGPU::V_LOG_F16_e32:
360   case AMDGPU::V_EXP_F16_e64:
361   case AMDGPU::V_EXP_F16_e32:
362   case AMDGPU::V_SIN_F16_e64:
363   case AMDGPU::V_SIN_F16_e32:
364   case AMDGPU::V_COS_F16_e64:
365   case AMDGPU::V_COS_F16_e32:
366   case AMDGPU::V_FLOOR_F16_e64:
367   case AMDGPU::V_FLOOR_F16_e32:
368   case AMDGPU::V_CEIL_F16_e64:
369   case AMDGPU::V_CEIL_F16_e32:
370   case AMDGPU::V_TRUNC_F16_e64:
371   case AMDGPU::V_TRUNC_F16_e32:
372   case AMDGPU::V_RNDNE_F16_e64:
373   case AMDGPU::V_RNDNE_F16_e32:
374   case AMDGPU::V_FRACT_F16_e64:
375   case AMDGPU::V_FRACT_F16_e32:
376   case AMDGPU::V_FREXP_MANT_F16_e64:
377   case AMDGPU::V_FREXP_MANT_F16_e32:
378   case AMDGPU::V_FREXP_EXP_I16_F16_e64:
379   case AMDGPU::V_FREXP_EXP_I16_F16_e32:
380   case AMDGPU::V_LDEXP_F16_e64:
381   case AMDGPU::V_LDEXP_F16_e32:
382   case AMDGPU::V_LSHLREV_B16_e64:
383   case AMDGPU::V_LSHLREV_B16_e32:
384   case AMDGPU::V_LSHRREV_B16_e64:
385   case AMDGPU::V_LSHRREV_B16_e32:
386   case AMDGPU::V_ASHRREV_I16_e64:
387   case AMDGPU::V_ASHRREV_I16_e32:
388   case AMDGPU::V_ADD_U16_e64:
389   case AMDGPU::V_ADD_U16_e32:
390   case AMDGPU::V_SUB_U16_e64:
391   case AMDGPU::V_SUB_U16_e32:
392   case AMDGPU::V_SUBREV_U16_e64:
393   case AMDGPU::V_SUBREV_U16_e32:
394   case AMDGPU::V_MUL_LO_U16_e64:
395   case AMDGPU::V_MUL_LO_U16_e32:
396   case AMDGPU::V_ADD_F16_e64:
397   case AMDGPU::V_ADD_F16_e32:
398   case AMDGPU::V_SUB_F16_e64:
399   case AMDGPU::V_SUB_F16_e32:
400   case AMDGPU::V_SUBREV_F16_e64:
401   case AMDGPU::V_SUBREV_F16_e32:
402   case AMDGPU::V_MUL_F16_e64:
403   case AMDGPU::V_MUL_F16_e32:
404   case AMDGPU::V_MAX_F16_e64:
405   case AMDGPU::V_MAX_F16_e32:
406   case AMDGPU::V_MIN_F16_e64:
407   case AMDGPU::V_MIN_F16_e32:
408   case AMDGPU::V_MAX_U16_e64:
409   case AMDGPU::V_MAX_U16_e32:
410   case AMDGPU::V_MIN_U16_e64:
411   case AMDGPU::V_MIN_U16_e32:
412   case AMDGPU::V_MAX_I16_e64:
413   case AMDGPU::V_MAX_I16_e32:
414   case AMDGPU::V_MIN_I16_e64:
415   case AMDGPU::V_MIN_I16_e32:
416     // On gfx10, all 16-bit instructions preserve the high bits.
417     return getGeneration() <= AMDGPUSubtarget::GFX9;
418   case AMDGPU::V_MAD_F16_e64:
419   case AMDGPU::V_MADAK_F16:
420   case AMDGPU::V_MADMK_F16:
421   case AMDGPU::V_MAC_F16_e64:
422   case AMDGPU::V_MAC_F16_e32:
423   case AMDGPU::V_FMAMK_F16:
424   case AMDGPU::V_FMAAK_F16:
425   case AMDGPU::V_MAD_U16_e64:
426   case AMDGPU::V_MAD_I16_e64:
427   case AMDGPU::V_FMA_F16_e64:
428   case AMDGPU::V_FMAC_F16_e64:
429   case AMDGPU::V_FMAC_F16_e32:
430   case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them, but some
    // changed to preserving the high bits.
434     return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
435   case AMDGPU::V_MAD_MIXLO_F16:
436   case AMDGPU::V_MAD_MIXHI_F16:
437   default:
438     return false;
439   }
440 }
441 
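// For illustration (assumed values): with 64 KiB of LDS, a maximum of 10
// waves per EU, 5 work groups per CU and NWaves == 2, the computation below
// returns 65536 * 10 / 5 / 2 = 65536 bytes.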
442 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
443   const Function &F) const {
444   if (NWaves == 1)
445     return getLocalMemorySize();
446   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
447   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
448   if (!WorkGroupsPerCu)
449     return 0;
450   unsigned MaxWaves = getMaxWavesPerEU();
451   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
452 }
453 
454 // FIXME: Should return min,max range.
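// For illustration (assumed values): with 64 KiB of LDS, a wave size of 64, a
// maximum work group size of 256 and 16 KiB of LDS used per work group (and
// assuming the work-groups-per-CU limit is not the constraint), NumGroups is
// 65536 / 16384 = 4 and MaxGroupNumWaves is (256 + 63) / 64 = 4, so the
// LDS-limited occupancy is 4 * 4 = 16 waves, clamped to getMaxWavesPerEU().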
455 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
456   const Function &F) const {
457   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
458   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
459   if (!MaxWorkGroupsPerCu)
460     return 0;
461 
462   const unsigned WaveSize = getWavefrontSize();
463 
  // FIXME: Do we need to account for the alignment requirement of LDS
  // rounding the size up?
  // Compute the restriction based on LDS usage.
467   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
468 
469   // This can be queried with more LDS than is possible, so just assume the
470   // worst.
471   if (NumGroups == 0)
472     return 1;
473 
474   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
475 
476   // Round to the number of waves.
477   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
478   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
479 
480   // Clamp to the maximum possible number of waves.
481   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
482 
483   // FIXME: Needs to be a multiple of the group size?
484   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
485 
486   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
487          "computed invalid occupancy");
488   return MaxWaves;
489 }
490 
491 unsigned
492 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
493   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
494   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
495 }
496 
497 std::pair<unsigned, unsigned>
498 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
499   switch (CC) {
500   case CallingConv::AMDGPU_VS:
501   case CallingConv::AMDGPU_LS:
502   case CallingConv::AMDGPU_HS:
503   case CallingConv::AMDGPU_ES:
504   case CallingConv::AMDGPU_GS:
505   case CallingConv::AMDGPU_PS:
506     return std::make_pair(1, getWavefrontSize());
507   default:
508     return std::make_pair(1u, getMaxFlatWorkGroupSize());
509   }
510 }
511 
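// The requested sizes come from an IR function attribute; an illustrative
// kernel carrying it looks like
//   define amdgpu_kernel void @k() #0 { ... }
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }
// (example values; any pair within the subtarget's limits is accepted).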
512 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
513   const Function &F) const {
514   // Default minimum/maximum flat work group sizes.
515   std::pair<unsigned, unsigned> Default =
516     getDefaultFlatWorkGroupSize(F.getCallingConv());
517 
518   // Requested minimum/maximum flat work group sizes.
519   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
520     F, "amdgpu-flat-work-group-size", Default);
521 
  // Make sure the requested minimum does not exceed the requested maximum.
523   if (Requested.first > Requested.second)
524     return Default;
525 
526   // Make sure requested values do not violate subtarget's specifications.
527   if (Requested.first < getMinFlatWorkGroupSize())
528     return Default;
529   if (Requested.second > getMaxFlatWorkGroupSize())
530     return Default;
531 
532   return Requested;
533 }
534 
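// As with the flat work group size, the request comes from an IR attribute,
// for example
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }
// (illustrative values; requests outside the subtarget's limits, or ones
// inconsistent with the flat work group sizes, fall back to the default).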
535 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
536     const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
537   // Default minimum/maximum number of waves per execution unit.
538   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
539 
540   // If minimum/maximum flat work group sizes were explicitly requested using
541   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
542   // number of waves per execution unit to values implied by requested
543   // minimum/maximum flat work group sizes.
544   unsigned MinImpliedByFlatWorkGroupSize =
545     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
546   Default.first = MinImpliedByFlatWorkGroupSize;
547 
548   // Requested minimum/maximum number of waves per execution unit.
549   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
550     F, "amdgpu-waves-per-eu", Default, true);
551 
  // Make sure the requested minimum does not exceed the requested maximum.
553   if (Requested.second && Requested.first > Requested.second)
554     return Default;
555 
556   // Make sure requested values do not violate subtarget's specifications.
557   if (Requested.first < getMinWavesPerEU() ||
558       Requested.second > getMaxWavesPerEU())
559     return Default;
560 
561   // Make sure requested values are compatible with values implied by requested
562   // minimum/maximum flat work group sizes.
563   if (Requested.first < MinImpliedByFlatWorkGroupSize)
564     return Default;
565 
566   return Requested;
567 }
568 
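// An illustrative kernel carrying reqd_work_group_size metadata:
//   define amdgpu_kernel void @k() !reqd_work_group_size !0 { ... }
//   !0 = !{i32 256, i32 1, i32 1}
// Dim selects the X, Y or Z operand (0, 1 or 2 respectively).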
569 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
570   auto Node = Kernel.getMetadata("reqd_work_group_size");
571   if (Node && Node->getNumOperands() == 3)
572     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
573   return std::numeric_limits<unsigned>::max();
574 }
575 
576 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
577   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
578 }
579 
580 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
581                                            unsigned Dimension) const {
582   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
583   if (ReqdSize != std::numeric_limits<unsigned>::max())
584     return ReqdSize - 1;
585   return getFlatWorkGroupSizes(Kernel).second - 1;
586 }
587 
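// For illustration, with a reqd_work_group_size of 256x1x1 an ID query such as
//   %id = call i32 @llvm.amdgcn.workitem.id.x(), !range !0
// ends up annotated with !0 = !{i32 0, i32 256}, i.e. the half-open interval
// [0, 256), while the corresponding local size query would get [256, 257).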
588 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
589   Function *Kernel = I->getParent()->getParent();
590   unsigned MinSize = 0;
591   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
592   bool IdQuery = false;
593 
  // If reqd_work_group_size is present, it narrows the value down.
595   if (auto *CI = dyn_cast<CallInst>(I)) {
596     const Function *F = CI->getCalledFunction();
597     if (F) {
598       unsigned Dim = UINT_MAX;
599       switch (F->getIntrinsicID()) {
600       case Intrinsic::amdgcn_workitem_id_x:
601       case Intrinsic::r600_read_tidig_x:
602         IdQuery = true;
603         LLVM_FALLTHROUGH;
604       case Intrinsic::r600_read_local_size_x:
605         Dim = 0;
606         break;
607       case Intrinsic::amdgcn_workitem_id_y:
608       case Intrinsic::r600_read_tidig_y:
609         IdQuery = true;
610         LLVM_FALLTHROUGH;
611       case Intrinsic::r600_read_local_size_y:
612         Dim = 1;
613         break;
614       case Intrinsic::amdgcn_workitem_id_z:
615       case Intrinsic::r600_read_tidig_z:
616         IdQuery = true;
617         LLVM_FALLTHROUGH;
618       case Intrinsic::r600_read_local_size_z:
619         Dim = 2;
620         break;
621       default:
622         break;
623       }
624 
625       if (Dim <= 3) {
626         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
627         if (ReqdSize != std::numeric_limits<unsigned>::max())
628           MinSize = MaxSize = ReqdSize;
629       }
630     }
631   }
632 
633   if (!MaxSize)
634     return false;
635 
  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
638   if (IdQuery)
639     MinSize = 0;
640   else
641     ++MaxSize;
642 
643   MDBuilder MDB(I->getContext());
644   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
645                                                   APInt(32, MaxSize));
646   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
647   return true;
648 }
649 
650 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
651   assert(AMDGPU::isKernel(F.getCallingConv()));
652 
653   // We don't allocate the segment if we know the implicit arguments weren't
654   // used, even if the ABI implies we need them.
655   if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
656     return 0;
657 
658   if (isMesaKernel(F))
659     return 16;
660 
661   // Assume all implicit inputs are used by default
662   return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
663 }
664 
665 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
666                                                  Align &MaxAlign) const {
667   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
668          F.getCallingConv() == CallingConv::SPIR_KERNEL);
669 
670   const DataLayout &DL = F.getParent()->getDataLayout();
671   uint64_t ExplicitArgBytes = 0;
672   MaxAlign = Align(1);
673 
674   for (const Argument &Arg : F.args()) {
675     const bool IsByRef = Arg.hasByRefAttr();
676     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
677     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
678     if (!Alignment)
679       Alignment = DL.getABITypeAlign(ArgTy);
680 
681     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
682     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
683     MaxAlign = max(MaxAlign, Alignment);
684   }
685 
686   return ExplicitArgBytes;
687 }
688 
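// For illustration (assumed values): an HSA kernel taking (i32, ptr) has
// ExplicitArgBytes = alignTo(4, 8) + 8 = 16 and MaxAlign = 8; assuming an
// explicit offset of 0, the 56 implicit bytes noted above and an 8-byte
// implicit-arg alignment, the segment size is alignTo(16, 8) + 56 = 72, which
// is already a multiple of 4.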
689 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
690                                                 Align &MaxAlign) const {
691   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
692 
693   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
694 
695   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
696   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
697   if (ImplicitBytes != 0) {
698     const Align Alignment = getAlignmentForImplicitArgPtr();
699     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
700     MaxAlign = std::max(MaxAlign, Alignment);
701   }
702 
703   // Being able to dereference past the end is useful for emitting scalar loads.
704   return alignTo(TotalSize, 4);
705 }
706 
707 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
708   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
709                                   : AMDGPUDwarfFlavour::Wave64;
710 }
711 
712 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
713                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease pressure
  // once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
717   Policy.ShouldTrackPressure = true;
718 
  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
721   Policy.OnlyTopDown = false;
722   Policy.OnlyBottomUp = false;
723 
724   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
725   if (!enableSIScheduler())
726     Policy.ShouldTrackLaneMasks = true;
727 }
728 
729 bool GCNSubtarget::hasMadF16() const {
730   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
731 }
732 
733 bool GCNSubtarget::useVGPRIndexMode() const {
734   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
735 }
736 
737 bool GCNSubtarget::useAA() const { return UseAA; }
738 
739 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
740   if (getGeneration() >= AMDGPUSubtarget::GFX10)
741     return getMaxWavesPerEU();
742 
743   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
744     if (SGPRs <= 80)
745       return 10;
746     if (SGPRs <= 88)
747       return 9;
748     if (SGPRs <= 100)
749       return 8;
750     return 7;
751   }
752   if (SGPRs <= 48)
753     return 10;
754   if (SGPRs <= 56)
755     return 9;
756   if (SGPRs <= 64)
757     return 8;
758   if (SGPRs <= 72)
759     return 7;
760   if (SGPRs <= 80)
761     return 6;
762   return 5;
763 }
764 
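// For illustration (assumed values): with a 4-register allocation granule and
// 256 total VGPRs, a function using 37 VGPRs rounds up to 40 registers,
// giving min(max(256 / 40, 1), MaxWaves) = 6 waves.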
765 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
766   unsigned MaxWaves = getMaxWavesPerEU();
767   unsigned Granule = getVGPRAllocGranule();
768   if (VGPRs < Granule)
769     return MaxWaves;
770   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
771   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
772 }
773 
774 unsigned
775 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
776   if (getGeneration() >= AMDGPUSubtarget::GFX10)
777     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
778 
779   if (HasFlatScratchInit || HasArchitectedFlatScratch) {
780     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
781       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
782     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
783       return 4; // FLAT_SCRATCH, VCC (in that order).
784   }
785 
786   if (isXNACKEnabled())
787     return 4; // XNACK, VCC (in that order).
788   return 2; // VCC.
789 }
790 
791 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
792   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
793   return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
794 }
795 
796 unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect if the function has flat scratch init is slightly
  // different from how the SIMachineFunctionInfo constructor derives it.
  // We don't use the amdgpu-calls and amdgpu-stack-objects attributes or
  // isAmdHsaOrMesa here as it doesn't really matter.
  // TODO: Outline this derivation logic into one common function in the
  // backend to avoid duplication.
804   bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
805   bool FunctionHasFlatScratchInit = false;
806   if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
807       enableFlatScratch()) {
808     FunctionHasFlatScratchInit = true;
809   }
810   return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
811 }
812 
813 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
814                                         unsigned NumSGPRs,
815                                         unsigned NumVGPRs) const {
816   unsigned Occupancy =
817     std::min(getMaxWavesPerEU(),
818              getOccupancyWithLocalMemSize(LDSSize, F));
819   if (NumSGPRs)
820     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
821   if (NumVGPRs)
822     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
823   return Occupancy;
824 }
825 
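// The SGPR budget can also be narrowed per function through an IR attribute,
// for example
//   attributes #0 = { "amdgpu-num-sgpr"="48" }
// (illustrative value; requests that conflict with reserved registers or with
// the waves-per-EU constraints below are ignored).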
826 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
827     const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
828     unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
831   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
832   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
833 
834   // Check if maximum number of SGPRs was explicitly requested using
835   // "amdgpu-num-sgpr" attribute.
836   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
837     unsigned Requested = AMDGPU::getIntegerAttribute(
838       F, "amdgpu-num-sgpr", MaxNumSGPRs);
839 
840     // Make sure requested value does not violate subtarget's specifications.
841     if (Requested && (Requested <= ReservedNumSGPRs))
842       Requested = 0;
843 
844     // If more SGPRs are required to support the input user/system SGPRs,
845     // increase to accommodate them.
846     //
847     // FIXME: This really ends up using the requested number of SGPRs + number
848     // of reserved special registers in total. Theoretically you could re-use
849     // the last input registers for these special registers, but this would
850     // require a lot of complexity to deal with the weird aliasing.
851     unsigned InputNumSGPRs = PreloadedSGPRs;
852     if (Requested && Requested < InputNumSGPRs)
853       Requested = InputNumSGPRs;
854 
855     // Make sure requested value is compatible with values implied by
856     // default/requested minimum/maximum number of waves per execution unit.
857     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
858       Requested = 0;
859     if (WavesPerEU.second &&
860         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
861       Requested = 0;
862 
863     if (Requested)
864       MaxNumSGPRs = Requested;
865   }
866 
867   if (hasSGPRInitBug())
868     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
869 
870   return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
871 }
872 
873 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
874   const Function &F = MF.getFunction();
875   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
876   return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
877                             getReservedNumSGPRs(MF));
878 }
879 
880 static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs.
  unsigned MaxUserSGPRs = 4 + // Private segment buffer
                          2 + // Dispatch ptr
                          2 + // Queue ptr
                          2 + // Kernarg segment ptr
                          2 + // Dispatch ID
                          2 + // Flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs.
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // Private segment wave byte offset
895   return MaxUserSGPRs + MaxSystemSGPRs;
896 }
897 
898 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
899   return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
900                             getReservedNumSGPRs(F));
901 }
902 
903 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
904     const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
907   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
908 
909   // Check if maximum number of VGPRs was explicitly requested using
910   // "amdgpu-num-vgpr" attribute.
911   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
912     unsigned Requested = AMDGPU::getIntegerAttribute(
913       F, "amdgpu-num-vgpr", MaxNumVGPRs);
914 
915     if (hasGFX90AInsts())
916       Requested *= 2;
917 
918     // Make sure requested value is compatible with values implied by
919     // default/requested minimum/maximum number of waves per execution unit.
920     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
921       Requested = 0;
922     if (WavesPerEU.second &&
923         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
924       Requested = 0;
925 
926     if (Requested)
927       MaxNumVGPRs = Requested;
928   }
929 
930   return MaxNumVGPRs;
931 }
932 
933 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
934   return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
935 }
936 
937 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
938   const Function &F = MF.getFunction();
939   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
940   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
941 }
942 
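// Adjust the latency of data dependencies into and out of bundles: for a
// bundled def, use the latency of the bundled instruction that writes the
// register, reduced by the number of bundled instructions that follow it; for
// a bundled use, shrink the def's latency by the position of the first
// bundled reader.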
943 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
944                                          int UseOpIdx, SDep &Dep) const {
945   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
946       !Def->isInstr() || !Use->isInstr())
947     return;
948 
949   MachineInstr *DefI = Def->getInstr();
950   MachineInstr *UseI = Use->getInstr();
951 
952   if (DefI->isBundle()) {
953     const SIRegisterInfo *TRI = getRegisterInfo();
954     auto Reg = Dep.getReg();
955     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
956     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
957     unsigned Lat = 0;
958     for (++I; I != E && I->isBundledWithPred(); ++I) {
959       if (I->modifiesRegister(Reg, TRI))
960         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
961       else if (Lat)
962         --Lat;
963     }
964     Dep.setLatency(Lat);
965   } else if (UseI->isBundle()) {
966     const SIRegisterInfo *TRI = getRegisterInfo();
967     auto Reg = Dep.getReg();
968     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
969     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
970     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
971     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
972       if (I->readsRegister(Reg, TRI))
973         break;
974       --Lat;
975     }
976     Dep.setLatency(Lat);
977   } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands that come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
982     Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
983         DefI, DefOpIdx, UseI, UseOpIdx));
984   }
985 }
986 
987 namespace {
988 struct FillMFMAShadowMutation : ScheduleDAGMutation {
989   const SIInstrInfo *TII;
990 
991   ScheduleDAGMI *DAG;
992 
993   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
994 
995   bool isSALU(const SUnit *SU) const {
996     const MachineInstr *MI = SU->getInstr();
997     return MI && TII->isSALU(*MI) && !MI->isTerminator();
998   }
999 
1000   bool isVALU(const SUnit *SU) const {
1001     const MachineInstr *MI = SU->getInstr();
1002     return MI && TII->isVALU(*MI);
1003   }
1004 
1005   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
1006     if (Pred->NodeNum < Succ->NodeNum)
1007       return true;
1008 
1009     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
1010 
1011     for (unsigned I = 0; I < Succs.size(); ++I) {
1012       for (const SDep &SI : Succs[I]->Succs) {
1013         const SUnit *SU = SI.getSUnit();
1014         if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
1015           Succs.push_back(SU);
1016       }
1017     }
1018 
1019     SmallPtrSet<const SUnit*, 32> Visited;
1020     while (!Preds.empty()) {
1021       const SUnit *SU = Preds.pop_back_val();
1022       if (llvm::is_contained(Succs, SU))
1023         return false;
1024       Visited.insert(SU);
1025       for (const SDep &SI : SU->Preds)
1026         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
1027           Preds.push_back(SI.getSUnit());
1028     }
1029 
1030     return true;
1031   }
1032 
  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
1035   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
1036                          SmallPtrSetImpl<SUnit *> &Visited) const {
1037     SmallVector<SUnit *, 8> Worklist({To});
1038     unsigned Linked = 0;
1039 
1040     while (!Worklist.empty() && MaxChain-- > 0) {
1041       SUnit *SU = Worklist.pop_back_val();
1042       if (!Visited.insert(SU).second)
1043         continue;
1044 
      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
1047 
1048       if (SU->addPred(SDep(From, SDep::Artificial), false))
1049         ++Linked;
1050 
1051       for (SDep &SI : From->Succs) {
1052         SUnit *SUv = SI.getSUnit();
1053         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
1054           SUv->addPred(SDep(SU, SDep::Artificial), false);
1055       }
1056 
1057       for (SDep &SI : SU->Succs) {
1058         SUnit *Succ = SI.getSUnit();
1059         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
1060           Worklist.push_back(Succ);
1061       }
1062     }
1063 
1064     return Linked;
1065   }
1066 
1067   void apply(ScheduleDAGInstrs *DAGInstrs) override {
1068     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1069     if (!ST.hasMAIInsts() || DisablePowerSched)
1070       return;
1071     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1072     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1073     if (!TSchedModel || DAG->SUnits.empty())
1074       return;
1075 
    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that they can fill the MFMA shadow.
    // Filling the shadow with SALU rather than VALU instructions is desirable
    // to prevent power consumption bursts and throttling.
1080     auto LastSALU = DAG->SUnits.begin();
1081     auto E = DAG->SUnits.end();
1082     SmallPtrSet<SUnit*, 32> Visited;
1083     for (SUnit &SU : DAG->SUnits) {
1084       MachineInstr &MAI = *SU.getInstr();
1085       if (!TII->isMAI(MAI) ||
1086            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
1087            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
1088         continue;
1089 
1090       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
1091 
1092       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
1093                  dbgs() << "Need " << Lat
1094                         << " instructions to cover latency.\n");
1095 
1096       // Find up to Lat independent scalar instructions as early as
1097       // possible such that they can be scheduled after this MFMA.
1098       for ( ; Lat && LastSALU != E; ++LastSALU) {
1099         if (Visited.count(&*LastSALU))
1100           continue;
1101 
1102         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
1103           continue;
1104 
1105         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1106       }
1107     }
1108   }
1109 };
1110 } // namespace
1111 
1112 void GCNSubtarget::getPostRAMutations(
1113     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1114   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1115 }
1116 
1117 std::unique_ptr<ScheduleDAGMutation>
1118 GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
1119   return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
1120 }
1121 
1122 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1123   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1124     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1125   else
1126     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1127 }
1128 
const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
1130   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1131     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1132   else
1133     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
1134 }
1135