//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX940Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    EnableFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

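/// Return the number of source operands that may simultaneously read the
/// constant bus for the given opcode. GFX10+ generally allows two constant
/// bus uses per VALU instruction; the 64-bit shifts listed below (and all
/// pre-GFX10 instructions) are limited to one.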
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them; some changed
    // to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

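/// Return the maximum number of bytes of LDS a workgroup may use while still
/// allowing \p NWaves waves per execution unit for function \p F.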
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

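/// Return the maximum achievable occupancy (waves per execution unit) for
/// function \p F when each workgroup uses \p Bytes bytes of LDS.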
// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS when
  // rounding the size up?
  // Compute the occupancy restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

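/// Default flat workgroup size range for a calling convention: graphics
/// shaders are assumed to fit within a single wavefront, while compute-like
/// conventions may use anything up to the subtarget's maximum flat workgroup
/// size.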
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

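/// Return the size of dimension \p Dim from the kernel's reqd_work_group_size
/// metadata, or UINT_MAX if the metadata is absent.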
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

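/// Attach !range metadata to \p I, an instruction known to produce a workitem
/// id or local size value, using the flat workgroup size bounds (narrowed by
/// reqd_work_group_size when present). Returns true if metadata was added.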
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

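/// Return the number of bytes of implicit kernel arguments appended after the
/// explicit arguments, or 0 if the implicit argument pointer is known to be
/// unused.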
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

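/// Compute the total size in bytes of the explicit kernel arguments of \p F,
/// honoring byref types and argument alignment, and report the largest
/// alignment seen through \p MaxAlign.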
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

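/// Return the size in bytes of the kernarg segment for \p F, covering the
/// explicit kernel arguments and any implicit arguments, rounded up to a
/// multiple of 4.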
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

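/// Tune the generic machine scheduler for GCN: always track register pressure
/// and schedule bidirectionally to reduce spilling.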
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

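/// Return the maximum number of waves per execution unit sustainable when
/// \p SGPRs scalar registers are in use.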
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

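/// Return the number of SGPRs reserved for VCC and, where applicable,
/// FLAT_SCRATCH and XNACK.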
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now, assume it is needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

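/// Compute the maximum number of SGPRs a function may use, accounting for the
/// waves-per-EU range, preloaded input SGPRs, reserved SGPRs, and the
/// "amdgpu-num-sgpr" attribute.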
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

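/// Return a worst-case estimate of the number of user and system SGPRs that
/// may be preloaded for a kernel.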
static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

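/// Adjust the latency of a data dependence edge when either end is an
/// instruction bundle, deriving it from the bundled instruction that actually
/// defines or reads the register. Also recomputes zero-latency edges on
/// VCC_LO (see the workaround below).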
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
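/// DAG mutation that adds artificial edges so that independent SALU
/// instructions can be scheduled into the shadow of long-latency MFMA
/// instructions, avoiding power bursts from back-to-back VALU/MFMA issue
/// (see the "amdgpu-disable-power-sched" option above).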
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size of
  // the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU rather than VALU instructions
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

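/// Return the common AMDGPUSubtarget for \p MF, dispatching to GCNSubtarget
/// or R600Subtarget based on the target architecture.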
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}