//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing. Other OSes
  // default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    EnableFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

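// Number of constant-bus operands (SGPRs or literals) an instruction may use:
// one before GFX10 and for the 64-bit shifts listed below, two otherwise.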
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing them. Some
    // instructions changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

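// Return the LDS budget, in bytes, that still allows NWaves waves per
// execution unit: the device LDS size scaled by MaxWaves / NWaves and divided
// among the work groups resident on the CU.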
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

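// The default flat work group size range is [1, wavefront size] for shader
// calling conventions and [1, maximum flat work group size] for everything
// else.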
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then raise the default
  // minimum number of waves per execution unit to the value implied by the
  // requested maximum flat work group size.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

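// Return dimension Dim of the kernel's reqd_work_group_size metadata, or
// UINT_MAX if the metadata is absent or malformed.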
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

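// Attach !range metadata to a workitem id or local size query, bounding it by
// the kernel's flat work group size (or the exact reqd_work_group_size when
// present). Returns true if metadata was added.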
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query the IDs are in [0, MaxSize),
  // so MaxSize itself is the exclusive upper bound. For a size query the
  // result can equal MaxSize, so the upper bound is MaxSize + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

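// Compute the number of bytes occupied by the explicit kernel arguments, each
// laid out at its explicit or ABI alignment (byref arguments use their pointee
// type), and report the largest alignment seen through MaxAlign.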
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

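// Total kernarg segment size: the explicit arguments plus any implicit
// argument area, rounded up to a multiple of 4 bytes.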
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

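// Occupancy as limited by VGPR usage: round the VGPR count up to the
// allocation granule and count how many such allocations fit in the register
// file, clamped to the maximum number of waves per EU.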
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

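// Adjust the latency of data dependencies into or out of a bundle so it
// reflects the position of the defining or reading instruction within the
// bundle rather than the bundle header alone.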
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

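  // Return true if an artificial edge making Pred a predecessor of Succ can be
  // added without creating a cycle, i.e. there is no existing dependency path
  // from Succ back to Pred.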
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Links up to
  // MaxChain instructions and returns the number of instructions linked.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that those SALU instructions can fill the
    // MFMA shadow. Filling the shadow with SALU rather than VALU instructions
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}