1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Implements the AMDGPU-specific subclass of TargetSubtargetInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPUInstructionSelector.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "R600Subtarget.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #undef AMDGPUSubtarget
42 
43 static cl::opt<bool> DisablePowerSched(
44   "amdgpu-disable-power-sched",
45   cl::desc("Disable scheduling to minimize mAI power bursts"),
46   cl::init(false));
47 
48 static cl::opt<bool> EnableVGPRIndexMode(
49   "amdgpu-vgpr-index-mode",
50   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51   cl::init(false));
52 
53 static cl::opt<bool> EnableFlatScratch(
54   "amdgpu-enable-flat-scratch",
55   cl::desc("Use flat scratch instructions"),
56   cl::init(false));
57 
58 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
59                            cl::desc("Enable the use of AA during codegen."),
60                            cl::init(true));
61 
62 GCNSubtarget::~GCNSubtarget() = default;
63 
64 GCNSubtarget &
65 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
66                                               StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
75 
76   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
77 
  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
79   if (isAmdHsaOS())
80     FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
81 
82   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
83 
84   // Disable mutually exclusive bits.
85   if (FS.contains_insensitive("+wavefrontsize")) {
86     if (!FS.contains_insensitive("wavefrontsize16"))
87       FullFS += "-wavefrontsize16,";
88     if (!FS.contains_insensitive("wavefrontsize32"))
89       FullFS += "-wavefrontsize32,";
90     if (!FS.contains_insensitive("wavefrontsize64"))
91       FullFS += "-wavefrontsize64,";
92   }
93 
94   FullFS += FS;
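  // User-provided features are appended last so that, e.g., an explicit
  // "-enable-prt-strict-null" in FS overrides the "+enable-prt-strict-null"
  // added above.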
95 
96   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
97 
  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }
106 
  // We don't currently support FP64 for EG/NI.
108   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
109 
  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
113   assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
118   if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
119     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
120     FlatForGlobal = true;
121   }
122   // Unless +-flat-for-global is specified, use MUBUF instructions for global
123   // address space access if flat operations are not available.
124   if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
125     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
126     FlatForGlobal = false;
127   }
128 
129   // Set defaults if needed.
130   if (MaxPrivateElementSize == 0)
131     MaxPrivateElementSize = 4;
132 
133   if (LDSBankCount == 0)
134     LDSBankCount = 32;
135 
136   if (TT.getArch() == Triple::amdgcn) {
137     if (LocalMemorySize == 0)
138       LocalMemorySize = 32768;
139 
140     // Do something sensible for unspecified target.
141     if (!HasMovrel && !HasVGPRIndexMode)
142       HasMovrel = true;
143   }
144 
145   // Don't crash on invalid devices.
146   if (WavefrontSizeLog2 == 0)
147     WavefrontSizeLog2 = 5;
148 
149   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
150   HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
151 
152   TargetID.setTargetIDFromFeaturesString(FS);
153 
154   LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
155                     << TargetID.getXnackSetting() << '\n');
156   LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
157                     << TargetID.getSramEccSetting() << '\n');
158 
159   return *this;
160 }
161 
162 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
163   TargetTriple(TT),
164   GCN3Encoding(false),
165   Has16BitInsts(false),
166   HasMadMixInsts(false),
167   HasMadMacF32Insts(false),
168   HasDsSrc2Insts(false),
169   HasSDWA(false),
170   HasVOP3PInsts(false),
171   HasMulI24(true),
172   HasMulU24(true),
173   HasSMulHi(false),
174   HasInv2PiInlineImm(false),
175   HasFminFmaxLegacy(true),
176   EnablePromoteAlloca(false),
177   HasTrigReducedRange(false),
178   MaxWavesPerEU(10),
179   LocalMemorySize(0),
180   WavefrontSizeLog2(0)
181   { }
182 
183 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
184                            const GCNTargetMachine &TM)
185     : // clang-format off
186     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
187     AMDGPUSubtarget(TT),
188     TargetTriple(TT),
189     TargetID(*this),
190     Gen(INVALID),
191     InstrItins(getInstrItineraryForCPU(GPU)),
192     LDSBankCount(0),
193     MaxPrivateElementSize(0),
194 
195     FastFMAF32(false),
196     FastDenormalF32(false),
197     HalfRate64Ops(false),
198     FullRate64Ops(false),
199 
200     FlatForGlobal(false),
201     AutoWaitcntBeforeBarrier(false),
202     UnalignedScratchAccess(false),
203     UnalignedAccessMode(false),
204 
205     HasApertureRegs(false),
206     SupportsXNACK(false),
207     EnableXNACK(false),
208     EnableTgSplit(false),
209     EnableCuMode(false),
210     TrapHandler(false),
211 
212     EnableLoadStoreOpt(false),
213     EnableUnsafeDSOffsetFolding(false),
214     EnableSIScheduler(false),
215     EnableDS128(false),
216     EnablePRTStrictNull(false),
217     DumpCode(false),
218 
219     FP64(false),
220     CIInsts(false),
221     GFX8Insts(false),
222     GFX9Insts(false),
223     GFX90AInsts(false),
224     GFX10Insts(false),
225     GFX10_3Insts(false),
226     GFX7GFX8GFX9Insts(false),
227     SGPRInitBug(false),
228     NegativeScratchOffsetBug(false),
229     NegativeUnalignedScratchOffsetBug(false),
230     HasSMemRealTime(false),
231     HasIntClamp(false),
232     HasFmaMixInsts(false),
233     HasMovrel(false),
234     HasVGPRIndexMode(false),
235     HasScalarStores(false),
236     HasScalarAtomics(false),
237     HasSDWAOmod(false),
238     HasSDWAScalar(false),
239     HasSDWASdst(false),
240     HasSDWAMac(false),
241     HasSDWAOutModsVOPC(false),
242     HasDPP(false),
243     HasDPP8(false),
244     Has64BitDPP(false),
245     HasPackedFP32Ops(false),
246     HasExtendedImageInsts(false),
247     HasR128A16(false),
248     HasGFX10A16(false),
249     HasG16(false),
250     HasNSAEncoding(false),
251     NSAMaxSize(0),
252     GFX10_AEncoding(false),
253     GFX10_BEncoding(false),
254     HasDLInsts(false),
255     HasDot1Insts(false),
256     HasDot2Insts(false),
257     HasDot3Insts(false),
258     HasDot4Insts(false),
259     HasDot5Insts(false),
260     HasDot6Insts(false),
261     HasDot7Insts(false),
262     HasMAIInsts(false),
263     HasPkFmacF16Inst(false),
264     HasAtomicFaddInsts(false),
265     SupportsSRAMECC(false),
266     EnableSRAMECC(false),
267     HasNoSdstCMPX(false),
268     HasVscnt(false),
269     HasGetWaveIdInst(false),
270     HasSMemTimeInst(false),
271     HasShaderCyclesRegister(false),
272     HasRegisterBanking(false),
273     HasVOP3Literal(false),
274     HasNoDataDepHazard(false),
275     FlatAddressSpace(false),
276     FlatInstOffsets(false),
277     FlatGlobalInsts(false),
278     FlatScratchInsts(false),
279     ScalarFlatScratchInsts(false),
280     HasArchitectedFlatScratch(false),
281     AddNoCarryInsts(false),
282     HasUnpackedD16VMem(false),
283     LDSMisalignedBug(false),
284     HasMFMAInlineLiteralBug(false),
285     UnalignedBufferAccess(false),
286     UnalignedDSAccess(false),
287     HasPackedTID(false),
288 
289     ScalarizeGlobal(false),
290 
291     HasVcmpxPermlaneHazard(false),
292     HasVMEMtoScalarWriteHazard(false),
293     HasSMEMtoVectorWriteHazard(false),
294     HasInstFwdPrefetchBug(false),
295     HasVcmpxExecWARHazard(false),
296     HasLdsBranchVmemWARHazard(false),
297     HasNSAtoVMEMBug(false),
298     HasNSAClauseBug(false),
299     HasOffset3fBug(false),
300     HasFlatSegmentOffsetBug(false),
301     HasImageStoreD16Bug(false),
302     HasImageGather4D16Bug(false),
303 
304     FeatureDisable(false),
305     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
306     TLInfo(TM, *this),
307     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
308   // clang-format on
309   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
310   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
311   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
312   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
313   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
316 }
317 
318 bool GCNSubtarget::enableFlatScratch() const {
319   return flatScratchIsArchitected() ||
320          (EnableFlatScratch && hasFlatScratchInsts());
321 }
322 
323 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
324   if (getGeneration() < GFX10)
325     return 1;
326 
327   switch (Opcode) {
328   case AMDGPU::V_LSHLREV_B64_e64:
329   case AMDGPU::V_LSHLREV_B64_gfx10:
330   case AMDGPU::V_LSHL_B64_e64:
331   case AMDGPU::V_LSHRREV_B64_e64:
332   case AMDGPU::V_LSHRREV_B64_gfx10:
333   case AMDGPU::V_LSHR_B64_e64:
334   case AMDGPU::V_ASHRREV_I64_e64:
335   case AMDGPU::V_ASHRREV_I64_gfx10:
336   case AMDGPU::V_ASHR_I64_e64:
337     return 1;
338   }
339 
340   return 2;
341 }
342 
343 /// This list was mostly derived from experimentation.
344 bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
345   switch (Opcode) {
346   case AMDGPU::V_CVT_F16_F32_e32:
347   case AMDGPU::V_CVT_F16_F32_e64:
348   case AMDGPU::V_CVT_F16_U16_e32:
349   case AMDGPU::V_CVT_F16_U16_e64:
350   case AMDGPU::V_CVT_F16_I16_e32:
351   case AMDGPU::V_CVT_F16_I16_e64:
352   case AMDGPU::V_RCP_F16_e64:
353   case AMDGPU::V_RCP_F16_e32:
354   case AMDGPU::V_RSQ_F16_e64:
355   case AMDGPU::V_RSQ_F16_e32:
356   case AMDGPU::V_SQRT_F16_e64:
357   case AMDGPU::V_SQRT_F16_e32:
358   case AMDGPU::V_LOG_F16_e64:
359   case AMDGPU::V_LOG_F16_e32:
360   case AMDGPU::V_EXP_F16_e64:
361   case AMDGPU::V_EXP_F16_e32:
362   case AMDGPU::V_SIN_F16_e64:
363   case AMDGPU::V_SIN_F16_e32:
364   case AMDGPU::V_COS_F16_e64:
365   case AMDGPU::V_COS_F16_e32:
366   case AMDGPU::V_FLOOR_F16_e64:
367   case AMDGPU::V_FLOOR_F16_e32:
368   case AMDGPU::V_CEIL_F16_e64:
369   case AMDGPU::V_CEIL_F16_e32:
370   case AMDGPU::V_TRUNC_F16_e64:
371   case AMDGPU::V_TRUNC_F16_e32:
372   case AMDGPU::V_RNDNE_F16_e64:
373   case AMDGPU::V_RNDNE_F16_e32:
374   case AMDGPU::V_FRACT_F16_e64:
375   case AMDGPU::V_FRACT_F16_e32:
376   case AMDGPU::V_FREXP_MANT_F16_e64:
377   case AMDGPU::V_FREXP_MANT_F16_e32:
378   case AMDGPU::V_FREXP_EXP_I16_F16_e64:
379   case AMDGPU::V_FREXP_EXP_I16_F16_e32:
380   case AMDGPU::V_LDEXP_F16_e64:
381   case AMDGPU::V_LDEXP_F16_e32:
382   case AMDGPU::V_LSHLREV_B16_e64:
383   case AMDGPU::V_LSHLREV_B16_e32:
384   case AMDGPU::V_LSHRREV_B16_e64:
385   case AMDGPU::V_LSHRREV_B16_e32:
386   case AMDGPU::V_ASHRREV_I16_e64:
387   case AMDGPU::V_ASHRREV_I16_e32:
388   case AMDGPU::V_ADD_U16_e64:
389   case AMDGPU::V_ADD_U16_e32:
390   case AMDGPU::V_SUB_U16_e64:
391   case AMDGPU::V_SUB_U16_e32:
392   case AMDGPU::V_SUBREV_U16_e64:
393   case AMDGPU::V_SUBREV_U16_e32:
394   case AMDGPU::V_MUL_LO_U16_e64:
395   case AMDGPU::V_MUL_LO_U16_e32:
396   case AMDGPU::V_ADD_F16_e64:
397   case AMDGPU::V_ADD_F16_e32:
398   case AMDGPU::V_SUB_F16_e64:
399   case AMDGPU::V_SUB_F16_e32:
400   case AMDGPU::V_SUBREV_F16_e64:
401   case AMDGPU::V_SUBREV_F16_e32:
402   case AMDGPU::V_MUL_F16_e64:
403   case AMDGPU::V_MUL_F16_e32:
404   case AMDGPU::V_MAX_F16_e64:
405   case AMDGPU::V_MAX_F16_e32:
406   case AMDGPU::V_MIN_F16_e64:
407   case AMDGPU::V_MIN_F16_e32:
408   case AMDGPU::V_MAX_U16_e64:
409   case AMDGPU::V_MAX_U16_e32:
410   case AMDGPU::V_MIN_U16_e64:
411   case AMDGPU::V_MIN_U16_e32:
412   case AMDGPU::V_MAX_I16_e64:
413   case AMDGPU::V_MAX_I16_e32:
414   case AMDGPU::V_MIN_I16_e64:
415   case AMDGPU::V_MIN_I16_e32:
416     // On gfx10, all 16-bit instructions preserve the high bits.
417     return getGeneration() <= AMDGPUSubtarget::GFX9;
418   case AMDGPU::V_MAD_F16_e64:
419   case AMDGPU::V_MADAK_F16:
420   case AMDGPU::V_MADMK_F16:
421   case AMDGPU::V_MAC_F16_e64:
422   case AMDGPU::V_MAC_F16_e32:
423   case AMDGPU::V_FMAMK_F16:
424   case AMDGPU::V_FMAAK_F16:
425   case AMDGPU::V_MAD_U16_e64:
426   case AMDGPU::V_MAD_I16_e64:
427   case AMDGPU::V_FMA_F16_e64:
428   case AMDGPU::V_FMAC_F16_e64:
429   case AMDGPU::V_FMAC_F16_e32:
430   case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16 bits changed.
    // Most instructions maintain the legacy behavior of zeroing them. Some
    // instructions changed to preserving the high bits instead.
434     return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
435   case AMDGPU::V_MAD_MIXLO_F16:
436   case AMDGPU::V_MAD_MIXHI_F16:
437   default:
438     return false;
439   }
440 }
441 
442 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
443   const Function &F) const {
444   if (NWaves == 1)
445     return getLocalMemorySize();
446   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
447   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
448   if (!WorkGroupsPerCu)
449     return 0;
450   unsigned MaxWaves = getMaxWavesPerEU();
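  // Illustrative example (actual values depend on the subtarget): with 65536
  // bytes of LDS, 10 waves per EU, 8 work groups per CU and NWaves = 10, this
  // returns 65536 * 10 / 8 / 10 = 8192 bytes.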
451   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
452 }
453 
454 // FIXME: Should return min,max range.
455 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
456   const Function &F) const {
457   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
458   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
459   if (!MaxWorkGroupsPerCu)
460     return 0;
461 
462   const unsigned WaveSize = getWavefrontSize();
463 
  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute the restriction based on LDS usage.
467   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
468 
469   // This can be queried with more LDS than is possible, so just assume the
470   // worst.
471   if (NumGroups == 0)
472     return 1;
473 
474   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
475 
476   // Round to the number of waves.
477   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
478   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
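  // Illustrative example (actual values depend on the subtarget): 65536 bytes
  // of LDS and 16384 bytes used per work group give 4 groups; with a maximum
  // work group size of 256 and wave64 each group needs 4 waves, so the
  // LDS-limited count is 4 * 4 = 16 waves before clamping below.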
479 
480   // Clamp to the maximum possible number of waves.
481   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
482 
483   // FIXME: Needs to be a multiple of the group size?
484   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
485 
486   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
487          "computed invalid occupancy");
488   return MaxWaves;
489 }
490 
491 unsigned
492 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
493   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
494   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
495 }
496 
497 std::pair<unsigned, unsigned>
498 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
499   switch (CC) {
500   case CallingConv::AMDGPU_VS:
501   case CallingConv::AMDGPU_LS:
502   case CallingConv::AMDGPU_HS:
503   case CallingConv::AMDGPU_ES:
504   case CallingConv::AMDGPU_GS:
505   case CallingConv::AMDGPU_PS:
506     return std::make_pair(1, getWavefrontSize());
507   default:
508     return std::make_pair(1u, getMaxFlatWorkGroupSize());
509   }
510 }
511 
512 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
513   const Function &F) const {
514   // Default minimum/maximum flat work group sizes.
515   std::pair<unsigned, unsigned> Default =
516     getDefaultFlatWorkGroupSize(F.getCallingConv());
517 
518   // Requested minimum/maximum flat work group sizes.
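  // The attribute value is a comma-separated "min,max" pair, e.g.
  // "amdgpu-flat-work-group-size"="128,256".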
519   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
520     F, "amdgpu-flat-work-group-size", Default);
521 
  // Make sure the requested minimum does not exceed the requested maximum.
523   if (Requested.first > Requested.second)
524     return Default;
525 
526   // Make sure requested values do not violate subtarget's specifications.
527   if (Requested.first < getMinFlatWorkGroupSize())
528     return Default;
529   if (Requested.second > getMaxFlatWorkGroupSize())
530     return Default;
531 
532   return Requested;
533 }
534 
535 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
536     const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
537   // Default minimum/maximum number of waves per execution unit.
538   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
539 
540   // If minimum/maximum flat work group sizes were explicitly requested using
541   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
542   // number of waves per execution unit to values implied by requested
543   // minimum/maximum flat work group sizes.
544   unsigned MinImpliedByFlatWorkGroupSize =
545     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
546   Default.first = MinImpliedByFlatWorkGroupSize;
547 
548   // Requested minimum/maximum number of waves per execution unit.
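  // The attribute value is a comma-separated "min,max" pair, e.g.
  // "amdgpu-waves-per-eu"="2,4"; the trailing 'true' below allows the maximum
  // to be omitted.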
549   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
550     F, "amdgpu-waves-per-eu", Default, true);
551 
  // Make sure the requested minimum does not exceed the requested maximum.
553   if (Requested.second && Requested.first > Requested.second)
554     return Default;
555 
556   // Make sure requested values do not violate subtarget's specifications.
557   if (Requested.first < getMinWavesPerEU() ||
558       Requested.second > getMaxWavesPerEU())
559     return Default;
560 
561   // Make sure requested values are compatible with values implied by requested
562   // minimum/maximum flat work group sizes.
563   if (Requested.first < MinImpliedByFlatWorkGroupSize)
564     return Default;
565 
566   return Requested;
567 }
568 
569 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
570   auto Node = Kernel.getMetadata("reqd_work_group_size");
571   if (Node && Node->getNumOperands() == 3)
572     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
573   return std::numeric_limits<unsigned>::max();
574 }
575 
576 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
577   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
578 }
579 
580 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
581                                            unsigned Dimension) const {
582   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
583   if (ReqdSize != std::numeric_limits<unsigned>::max())
584     return ReqdSize - 1;
585   return getFlatWorkGroupSizes(Kernel).second - 1;
586 }
587 
588 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
589   Function *Kernel = I->getParent()->getParent();
590   unsigned MinSize = 0;
591   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
592   bool IdQuery = false;
593 
  // If reqd_work_group_size is present, it narrows the value down.
595   if (auto *CI = dyn_cast<CallInst>(I)) {
596     const Function *F = CI->getCalledFunction();
597     if (F) {
598       unsigned Dim = UINT_MAX;
599       switch (F->getIntrinsicID()) {
600       case Intrinsic::amdgcn_workitem_id_x:
601       case Intrinsic::r600_read_tidig_x:
602         IdQuery = true;
603         LLVM_FALLTHROUGH;
604       case Intrinsic::r600_read_local_size_x:
605         Dim = 0;
606         break;
607       case Intrinsic::amdgcn_workitem_id_y:
608       case Intrinsic::r600_read_tidig_y:
609         IdQuery = true;
610         LLVM_FALLTHROUGH;
611       case Intrinsic::r600_read_local_size_y:
612         Dim = 1;
613         break;
614       case Intrinsic::amdgcn_workitem_id_z:
615       case Intrinsic::r600_read_tidig_z:
616         IdQuery = true;
617         LLVM_FALLTHROUGH;
618       case Intrinsic::r600_read_local_size_z:
619         Dim = 2;
620         break;
621       default:
622         break;
623       }
624 
625       if (Dim <= 3) {
626         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
627         if (ReqdSize != std::numeric_limits<unsigned>::max())
628           MinSize = MaxSize = ReqdSize;
629       }
630     }
631   }
632 
633   if (!MaxSize)
634     return false;
635 
  // Range metadata is [Lo, Hi). For an ID query we need to pass the maximum
  // size as Hi. For a size query we need to pass the size + 1 as Hi.
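  // E.g. with a required work group size of 256, a work-item ID query gets the
  // range [0, 256), while a size query gets [256, 257).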
638   if (IdQuery)
639     MinSize = 0;
640   else
641     ++MaxSize;
642 
643   MDBuilder MDB(I->getContext());
644   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
645                                                   APInt(32, MaxSize));
646   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
647   return true;
648 }
649 
650 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
651   if (isMesaKernel(F))
652     return 16;
653   return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
654 }
655 
656 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
657                                                  Align &MaxAlign) const {
658   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
659          F.getCallingConv() == CallingConv::SPIR_KERNEL);
660 
661   const DataLayout &DL = F.getParent()->getDataLayout();
662   uint64_t ExplicitArgBytes = 0;
663   MaxAlign = Align(1);
664 
665   for (const Argument &Arg : F.args()) {
666     const bool IsByRef = Arg.hasByRefAttr();
667     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
668     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
669     if (!Alignment)
670       Alignment = DL.getABITypeAlign(ArgTy);
671 
672     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
673     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
674     MaxAlign = max(MaxAlign, Alignment);
675   }
676 
677   return ExplicitArgBytes;
678 }
679 
680 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
681                                                 Align &MaxAlign) const {
682   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
683 
684   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
685 
686   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
687   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
688   if (ImplicitBytes != 0) {
689     const Align Alignment = getAlignmentForImplicitArgPtr();
690     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
691     MaxAlign = std::max(MaxAlign, Alignment);
692   }
693 
694   // Being able to dereference past the end is useful for emitting scalar loads.
695   return alignTo(TotalSize, 4);
696 }
697 
698 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
699   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
700                                   : AMDGPUDwarfFlavour::Wave64;
701 }
702 
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
705   // Track register pressure so the scheduler can try to decrease
706   // pressure once register usage is above the threshold defined by
707   // SIRegisterInfo::getRegPressureSetLimit()
708   Policy.ShouldTrackPressure = true;
709 
  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
712   Policy.OnlyTopDown = false;
713   Policy.OnlyBottomUp = false;
714 
715   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
716   if (!enableSIScheduler())
717     Policy.ShouldTrackLaneMasks = true;
718 }
719 
720 bool GCNSubtarget::hasMadF16() const {
721   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
722 }
723 
724 bool GCNSubtarget::useVGPRIndexMode() const {
725   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
726 }
727 
728 bool GCNSubtarget::useAA() const { return UseAA; }
729 
730 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
731   if (getGeneration() >= AMDGPUSubtarget::GFX10)
732     return getMaxWavesPerEU();
733 
734   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
735     if (SGPRs <= 80)
736       return 10;
737     if (SGPRs <= 88)
738       return 9;
739     if (SGPRs <= 100)
740       return 8;
741     return 7;
742   }
743   if (SGPRs <= 48)
744     return 10;
745   if (SGPRs <= 56)
746     return 9;
747   if (SGPRs <= 64)
748     return 8;
749   if (SGPRs <= 72)
750     return 7;
751   if (SGPRs <= 80)
752     return 6;
753   return 5;
754 }
755 
756 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
757   unsigned MaxWaves = getMaxWavesPerEU();
758   unsigned Granule = getVGPRAllocGranule();
759   if (VGPRs < Granule)
760     return MaxWaves;
761   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
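  // E.g. (illustrative numbers) with a 4-register granule and 256 total VGPRs,
  // a function using 99 VGPRs rounds up to 100 and is limited to
  // 256 / 100 = 2 waves.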
762   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
763 }
764 
765 unsigned
766 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
767   if (getGeneration() >= AMDGPUSubtarget::GFX10)
768     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
769 
770   if (HasFlatScratchInit || HasArchitectedFlatScratch) {
771     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
772       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
773     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
774       return 4; // FLAT_SCRATCH, VCC (in that order).
775   }
776 
777   if (isXNACKEnabled())
778     return 4; // XNACK, VCC (in that order).
779   return 2; // VCC.
780 }
781 
782 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
783   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
784   return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
785 }
786 
787 unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect if the function has flat scratch init is slightly
  // different from how the SIMachineFunctionInfo constructor derives it.
  // We don't use the amdgpu-calls and amdgpu-stack-objects attributes or
  // isAmdHsaOrMesa here as it doesn't really matter.
  // TODO: Outline this derivation logic into one common function in the
  // backend to avoid duplication.
795   bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
796   bool FunctionHasFlatScratchInit = false;
797   if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
798       enableFlatScratch()) {
799     FunctionHasFlatScratchInit = true;
800   }
801   return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
802 }
803 
804 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
805                                         unsigned NumSGPRs,
806                                         unsigned NumVGPRs) const {
807   unsigned Occupancy =
808     std::min(getMaxWavesPerEU(),
809              getOccupancyWithLocalMemSize(LDSSize, F));
810   if (NumSGPRs)
811     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
812   if (NumVGPRs)
813     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
814   return Occupancy;
815 }
816 
817 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
818     const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
819     unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
822   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
823   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
824 
825   // Check if maximum number of SGPRs was explicitly requested using
826   // "amdgpu-num-sgpr" attribute.
827   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
828     unsigned Requested = AMDGPU::getIntegerAttribute(
829       F, "amdgpu-num-sgpr", MaxNumSGPRs);
830 
831     // Make sure requested value does not violate subtarget's specifications.
832     if (Requested && (Requested <= ReservedNumSGPRs))
833       Requested = 0;
834 
835     // If more SGPRs are required to support the input user/system SGPRs,
836     // increase to accommodate them.
837     //
838     // FIXME: This really ends up using the requested number of SGPRs + number
839     // of reserved special registers in total. Theoretically you could re-use
840     // the last input registers for these special registers, but this would
841     // require a lot of complexity to deal with the weird aliasing.
842     unsigned InputNumSGPRs = PreloadedSGPRs;
843     if (Requested && Requested < InputNumSGPRs)
844       Requested = InputNumSGPRs;
845 
846     // Make sure requested value is compatible with values implied by
847     // default/requested minimum/maximum number of waves per execution unit.
848     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
849       Requested = 0;
850     if (WavesPerEU.second &&
851         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
852       Requested = 0;
853 
854     if (Requested)
855       MaxNumSGPRs = Requested;
856   }
857 
858   if (hasSGPRInitBug())
859     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
860 
861   return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
862 }
863 
864 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
865   const Function &F = MF.getFunction();
866   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
867   return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
868                             getReservedNumSGPRs(MF));
869 }
870 
871 static unsigned getMaxNumPreloadedSGPRs() {
872   // Max number of user SGPRs
873   unsigned MaxUserSGPRs = 4 + // private segment buffer
874                           2 + // Dispatch ptr
875                           2 + // queue ptr
876                           2 + // kernel segment ptr
877                           2 + // dispatch ID
878                           2 + // flat scratch init
879                           2;  // Implicit buffer ptr
880   // Max number of system SGPRs
881   unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
882                             1 + // WorkGroupIDY
883                             1 + // WorkGroupIDZ
884                             1 + // WorkGroupInfo
885                             1;  // private segment wave byte offset
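  // In total: 16 user SGPRs + 5 system SGPRs = 21.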
886   return MaxUserSGPRs + MaxSystemSGPRs;
887 }
888 
889 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
890   return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
891                             getReservedNumSGPRs(F));
892 }
893 
894 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
895     const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
898   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
899 
900   // Check if maximum number of VGPRs was explicitly requested using
901   // "amdgpu-num-vgpr" attribute.
902   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
903     unsigned Requested = AMDGPU::getIntegerAttribute(
904       F, "amdgpu-num-vgpr", MaxNumVGPRs);
905 
906     if (hasGFX90AInsts())
907       Requested *= 2;
908 
909     // Make sure requested value is compatible with values implied by
910     // default/requested minimum/maximum number of waves per execution unit.
911     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
912       Requested = 0;
913     if (WavesPerEU.second &&
914         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
915       Requested = 0;
916 
917     if (Requested)
918       MaxNumVGPRs = Requested;
919   }
920 
921   return MaxNumVGPRs;
922 }
923 
924 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
925   return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
926 }
927 
928 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
929   const Function &F = MF.getFunction();
930   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
931   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
932 }
933 
934 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
935                                          int UseOpIdx, SDep &Dep) const {
936   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
937       !Def->isInstr() || !Use->isInstr())
938     return;
939 
940   MachineInstr *DefI = Def->getInstr();
941   MachineInstr *UseI = Use->getInstr();
942 
943   if (DefI->isBundle()) {
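    // The def is inside a bundle: take the latency of the bundled instruction
    // that actually writes the register, reduced by one for each bundled
    // instruction issued after it.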
944     const SIRegisterInfo *TRI = getRegisterInfo();
945     auto Reg = Dep.getReg();
946     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
947     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
948     unsigned Lat = 0;
949     for (++I; I != E && I->isBundledWithPred(); ++I) {
950       if (I->modifiesRegister(Reg, TRI))
951         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
952       else if (Lat)
953         --Lat;
954     }
955     Dep.setLatency(Lat);
956   } else if (UseI->isBundle()) {
957     const SIRegisterInfo *TRI = getRegisterInfo();
958     auto Reg = Dep.getReg();
959     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
960     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
961     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
962     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
963       if (I->readsRegister(Reg, TRI))
964         break;
965       --Lat;
966     }
967     Dep.setLatency(Lat);
968   } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
969     // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
970     // implicit operands which come from the MCInstrDesc, which can fool
971     // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
972     // pseudo operands.
973     Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
974         DefI, DefOpIdx, UseI, UseOpIdx));
975   }
976 }
977 
978 namespace {
979 struct FillMFMAShadowMutation : ScheduleDAGMutation {
980   const SIInstrInfo *TII;
981 
982   ScheduleDAGMI *DAG;
983 
984   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
985 
986   bool isSALU(const SUnit *SU) const {
987     const MachineInstr *MI = SU->getInstr();
988     return MI && TII->isSALU(*MI) && !MI->isTerminator();
989   }
990 
991   bool isVALU(const SUnit *SU) const {
992     const MachineInstr *MI = SU->getInstr();
993     return MI && TII->isVALU(*MI);
994   }
995 
996   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
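    // An artificial edge from Pred to Succ may be added only if it does not
    // create a cycle, i.e. Pred is not already transitively reachable from
    // Succ.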
997     if (Pred->NodeNum < Succ->NodeNum)
998       return true;
999 
1000     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
1001 
1002     for (unsigned I = 0; I < Succs.size(); ++I) {
1003       for (const SDep &SI : Succs[I]->Succs) {
1004         const SUnit *SU = SI.getSUnit();
1005         if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
1006           Succs.push_back(SU);
1007       }
1008     }
1009 
1010     SmallPtrSet<const SUnit*, 32> Visited;
1011     while (!Preds.empty()) {
1012       const SUnit *SU = Preds.pop_back_val();
1013       if (llvm::is_contained(Succs, SU))
1014         return false;
1015       Visited.insert(SU);
1016       for (const SDep &SI : SU->Preds)
1017         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
1018           Preds.push_back(SI.getSUnit());
1019     }
1020 
1021     return true;
1022   }
1023 
  // Link as many SALU instructions in a chain as possible. Return the size of
  // the chain. Links up to MaxChain instructions.
1026   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
1027                          SmallPtrSetImpl<SUnit *> &Visited) const {
1028     SmallVector<SUnit *, 8> Worklist({To});
1029     unsigned Linked = 0;
1030 
1031     while (!Worklist.empty() && MaxChain-- > 0) {
1032       SUnit *SU = Worklist.pop_back_val();
1033       if (!Visited.insert(SU).second)
1034         continue;
1035 
      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
1038 
1039       if (SU->addPred(SDep(From, SDep::Artificial), false))
1040         ++Linked;
1041 
1042       for (SDep &SI : From->Succs) {
1043         SUnit *SUv = SI.getSUnit();
1044         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
1045           SUv->addPred(SDep(SU, SDep::Artificial), false);
1046       }
1047 
1048       for (SDep &SI : SU->Succs) {
1049         SUnit *Succ = SI.getSUnit();
1050         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
1051           Worklist.push_back(Succ);
1052       }
1053     }
1054 
1055     return Linked;
1056   }
1057 
1058   void apply(ScheduleDAGInstrs *DAGInstrs) override {
1059     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1060     if (!ST.hasMAIInsts() || DisablePowerSched)
1061       return;
1062     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1063     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1064     if (!TSchedModel || DAG->SUnits.empty())
1065       return;
1066 
    // Scan for long-latency MFMA instructions and try to add dependencies on
    // them for available SALU instructions, giving those a chance to fill the
    // MFMA shadow. Filling the shadow with SALU rather than VALU instructions
    // is desirable to prevent power consumption bursts and throttling.
1071     auto LastSALU = DAG->SUnits.begin();
1072     auto E = DAG->SUnits.end();
1073     SmallPtrSet<SUnit*, 32> Visited;
1074     for (SUnit &SU : DAG->SUnits) {
1075       MachineInstr &MAI = *SU.getInstr();
1076       if (!TII->isMAI(MAI) ||
1077            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
1078            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
1079         continue;
1080 
1081       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
1082 
1083       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
1084                  dbgs() << "Need " << Lat
1085                         << " instructions to cover latency.\n");
1086 
1087       // Find up to Lat independent scalar instructions as early as
1088       // possible such that they can be scheduled after this MFMA.
1089       for ( ; Lat && LastSALU != E; ++LastSALU) {
1090         if (Visited.count(&*LastSALU))
1091           continue;
1092 
1093         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
1094           continue;
1095 
1096         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1097       }
1098     }
1099   }
1100 };
1101 } // namespace
1102 
1103 void GCNSubtarget::getPostRAMutations(
1104     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1105   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1106 }
1107 
1108 std::unique_ptr<ScheduleDAGMutation>
1109 GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
1110   return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
1111 }
1112 
1113 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1114   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1115     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1116   else
1117     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1118 }
1119 
1120 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
1121   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1122     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1123   else
1124     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
1125 }
1126