1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPUInstructionSelector.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "R600Subtarget.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #undef AMDGPUSubtarget
42 
43 static cl::opt<bool> DisablePowerSched(
44   "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
46   cl::init(false));
47 
48 static cl::opt<bool> EnableVGPRIndexMode(
49   "amdgpu-vgpr-index-mode",
50   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51   cl::init(false));
52 
53 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
54                            cl::desc("Enable the use of AA during codegen."),
55                            cl::init(true));
56 
57 GCNSubtarget::~GCNSubtarget() = default;
58 
59 GCNSubtarget &
60 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
61                                               StringRef GPU, StringRef FS) {
62   // Determine default and user-specified characteristics
63   //
64   // We want to be able to turn these off, but making this a subtarget feature
65   // for SI has the unhelpful behavior that it unsets everything else if you
66   // disable it.
67   //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
70 
71   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
72 
  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
74   if (isAmdHsaOS())
75     FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
76 
77   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
78 
79   // Disable mutually exclusive bits.
80   if (FS.contains_insensitive("+wavefrontsize")) {
81     if (!FS.contains_insensitive("wavefrontsize16"))
82       FullFS += "-wavefrontsize16,";
83     if (!FS.contains_insensitive("wavefrontsize32"))
84       FullFS += "-wavefrontsize32,";
85     if (!FS.contains_insensitive("wavefrontsize64"))
86       FullFS += "-wavefrontsize64,";
87   }
88 
89   FullFS += FS;
90 
91   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
92 
  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing. Other OSes
  // default to the first amdgcn target.
97   if (Gen == AMDGPUSubtarget::INVALID) {
98      Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
99                                         : AMDGPUSubtarget::SOUTHERN_ISLANDS;
100   }
101 
  // We don't support FP64 for EG/NI at the moment.
103   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
104 
  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
108   assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
113   if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
114     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
115     FlatForGlobal = true;
116   }
117   // Unless +-flat-for-global is specified, use MUBUF instructions for global
118   // address space access if flat operations are not available.
119   if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
120     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
121     FlatForGlobal = false;
122   }
123 
124   // Set defaults if needed.
125   if (MaxPrivateElementSize == 0)
126     MaxPrivateElementSize = 4;
127 
128   if (LDSBankCount == 0)
129     LDSBankCount = 32;
130 
131   if (TT.getArch() == Triple::amdgcn) {
132     if (LocalMemorySize == 0)
133       LocalMemorySize = 32768;
134 
    // Do something sensible for an unspecified target.
136     if (!HasMovrel && !HasVGPRIndexMode)
137       HasMovrel = true;
138   }
139 
140   // Don't crash on invalid devices.
141   if (WavefrontSizeLog2 == 0)
142     WavefrontSizeLog2 = 5;
143 
144   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
145   HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
146 
147   TargetID.setTargetIDFromFeaturesString(FS);
148 
149   LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
150                     << TargetID.getXnackSetting() << '\n');
151   LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
152                     << TargetID.getSramEccSetting() << '\n');
153 
154   return *this;
155 }
156 
157 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
158   TargetTriple(TT),
159   GCN3Encoding(false),
160   Has16BitInsts(false),
161   HasMadMixInsts(false),
162   HasMadMacF32Insts(false),
163   HasDsSrc2Insts(false),
164   HasSDWA(false),
165   HasVOP3PInsts(false),
166   HasMulI24(true),
167   HasMulU24(true),
168   HasSMulHi(false),
169   HasInv2PiInlineImm(false),
170   HasFminFmaxLegacy(true),
171   EnablePromoteAlloca(false),
172   HasTrigReducedRange(false),
173   MaxWavesPerEU(10),
174   LocalMemorySize(0),
175   WavefrontSizeLog2(0)
176   { }
177 
178 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
179                            const GCNTargetMachine &TM)
180     : // clang-format off
181     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
182     AMDGPUSubtarget(TT),
183     TargetTriple(TT),
184     TargetID(*this),
185     Gen(INVALID),
186     InstrItins(getInstrItineraryForCPU(GPU)),
187     LDSBankCount(0),
188     MaxPrivateElementSize(0),
189 
190     FastFMAF32(false),
191     FastDenormalF32(false),
192     HalfRate64Ops(false),
193     FullRate64Ops(false),
194 
195     FlatForGlobal(false),
196     AutoWaitcntBeforeBarrier(false),
197     UnalignedScratchAccess(false),
198     UnalignedAccessMode(false),
199 
200     HasApertureRegs(false),
201     SupportsXNACK(false),
202     EnableXNACK(false),
203     EnableTgSplit(false),
204     EnableCuMode(false),
205     TrapHandler(false),
206 
207     EnableLoadStoreOpt(false),
208     EnableUnsafeDSOffsetFolding(false),
209     EnableSIScheduler(false),
210     EnableDS128(false),
211     EnablePRTStrictNull(false),
212     DumpCode(false),
213 
214     FP64(false),
215     CIInsts(false),
216     GFX8Insts(false),
217     GFX9Insts(false),
218     GFX90AInsts(false),
219     GFX940Insts(false),
220     GFX10Insts(false),
221     GFX10_3Insts(false),
222     GFX7GFX8GFX9Insts(false),
223     SGPRInitBug(false),
224     NegativeScratchOffsetBug(false),
225     NegativeUnalignedScratchOffsetBug(false),
226     HasSMemRealTime(false),
227     HasIntClamp(false),
228     HasFmaMixInsts(false),
229     HasMovrel(false),
230     HasVGPRIndexMode(false),
231     HasScalarStores(false),
232     HasScalarAtomics(false),
233     HasSDWAOmod(false),
234     HasSDWAScalar(false),
235     HasSDWASdst(false),
236     HasSDWAMac(false),
237     HasSDWAOutModsVOPC(false),
238     HasDPP(false),
239     HasDPP8(false),
240     Has64BitDPP(false),
241     HasPackedFP32Ops(false),
242     HasImageInsts(false),
243     HasExtendedImageInsts(false),
244     HasR128A16(false),
245     HasGFX10A16(false),
246     HasG16(false),
247     HasNSAEncoding(false),
248     NSAMaxSize(0),
249     GFX10_AEncoding(false),
250     GFX10_BEncoding(false),
251     HasDLInsts(false),
252     HasDot1Insts(false),
253     HasDot2Insts(false),
254     HasDot3Insts(false),
255     HasDot4Insts(false),
256     HasDot5Insts(false),
257     HasDot6Insts(false),
258     HasDot7Insts(false),
259     HasMAIInsts(false),
260     HasPkFmacF16Inst(false),
261     HasAtomicFaddInsts(false),
262     SupportsSRAMECC(false),
263     EnableSRAMECC(false),
264     HasNoSdstCMPX(false),
265     HasVscnt(false),
266     HasGetWaveIdInst(false),
267     HasSMemTimeInst(false),
268     HasShaderCyclesRegister(false),
269     HasVOP3Literal(false),
270     HasNoDataDepHazard(false),
271     FlatAddressSpace(false),
272     FlatInstOffsets(false),
273     FlatGlobalInsts(false),
274     FlatScratchInsts(false),
275     ScalarFlatScratchInsts(false),
276     HasArchitectedFlatScratch(false),
277     EnableFlatScratch(false),
278     AddNoCarryInsts(false),
279     HasUnpackedD16VMem(false),
280     LDSMisalignedBug(false),
281     HasMFMAInlineLiteralBug(false),
282     UnalignedBufferAccess(false),
283     UnalignedDSAccess(false),
284     HasPackedTID(false),
285 
286     ScalarizeGlobal(false),
287 
288     HasVcmpxPermlaneHazard(false),
289     HasVMEMtoScalarWriteHazard(false),
290     HasSMEMtoVectorWriteHazard(false),
291     HasInstFwdPrefetchBug(false),
292     HasVcmpxExecWARHazard(false),
293     HasLdsBranchVmemWARHazard(false),
294     HasNSAtoVMEMBug(false),
295     HasNSAClauseBug(false),
296     HasOffset3fBug(false),
297     HasFlatSegmentOffsetBug(false),
298     HasImageStoreD16Bug(false),
299     HasImageGather4D16Bug(false),
300 
301     FeatureDisable(false),
302     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
303     TLInfo(TM, *this),
304     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
305   // clang-format on
306   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
307   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
308   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
309   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
310   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
311   InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
313 }
314 
315 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
316   if (getGeneration() < GFX10)
317     return 1;
318 
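  // On GFX10+, most instructions may use two constant bus operands; the 64-bit
  // shift opcodes below are still limited to one.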
319   switch (Opcode) {
320   case AMDGPU::V_LSHLREV_B64_e64:
321   case AMDGPU::V_LSHLREV_B64_gfx10:
322   case AMDGPU::V_LSHL_B64_e64:
323   case AMDGPU::V_LSHRREV_B64_e64:
324   case AMDGPU::V_LSHRREV_B64_gfx10:
325   case AMDGPU::V_LSHR_B64_e64:
326   case AMDGPU::V_ASHRREV_I64_e64:
327   case AMDGPU::V_ASHRREV_I64_gfx10:
328   case AMDGPU::V_ASHR_I64_e64:
329     return 1;
330   }
331 
332   return 2;
333 }
334 
335 /// This list was mostly derived from experimentation.
336 bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
337   switch (Opcode) {
338   case AMDGPU::V_CVT_F16_F32_e32:
339   case AMDGPU::V_CVT_F16_F32_e64:
340   case AMDGPU::V_CVT_F16_U16_e32:
341   case AMDGPU::V_CVT_F16_U16_e64:
342   case AMDGPU::V_CVT_F16_I16_e32:
343   case AMDGPU::V_CVT_F16_I16_e64:
344   case AMDGPU::V_RCP_F16_e64:
345   case AMDGPU::V_RCP_F16_e32:
346   case AMDGPU::V_RSQ_F16_e64:
347   case AMDGPU::V_RSQ_F16_e32:
348   case AMDGPU::V_SQRT_F16_e64:
349   case AMDGPU::V_SQRT_F16_e32:
350   case AMDGPU::V_LOG_F16_e64:
351   case AMDGPU::V_LOG_F16_e32:
352   case AMDGPU::V_EXP_F16_e64:
353   case AMDGPU::V_EXP_F16_e32:
354   case AMDGPU::V_SIN_F16_e64:
355   case AMDGPU::V_SIN_F16_e32:
356   case AMDGPU::V_COS_F16_e64:
357   case AMDGPU::V_COS_F16_e32:
358   case AMDGPU::V_FLOOR_F16_e64:
359   case AMDGPU::V_FLOOR_F16_e32:
360   case AMDGPU::V_CEIL_F16_e64:
361   case AMDGPU::V_CEIL_F16_e32:
362   case AMDGPU::V_TRUNC_F16_e64:
363   case AMDGPU::V_TRUNC_F16_e32:
364   case AMDGPU::V_RNDNE_F16_e64:
365   case AMDGPU::V_RNDNE_F16_e32:
366   case AMDGPU::V_FRACT_F16_e64:
367   case AMDGPU::V_FRACT_F16_e32:
368   case AMDGPU::V_FREXP_MANT_F16_e64:
369   case AMDGPU::V_FREXP_MANT_F16_e32:
370   case AMDGPU::V_FREXP_EXP_I16_F16_e64:
371   case AMDGPU::V_FREXP_EXP_I16_F16_e32:
372   case AMDGPU::V_LDEXP_F16_e64:
373   case AMDGPU::V_LDEXP_F16_e32:
374   case AMDGPU::V_LSHLREV_B16_e64:
375   case AMDGPU::V_LSHLREV_B16_e32:
376   case AMDGPU::V_LSHRREV_B16_e64:
377   case AMDGPU::V_LSHRREV_B16_e32:
378   case AMDGPU::V_ASHRREV_I16_e64:
379   case AMDGPU::V_ASHRREV_I16_e32:
380   case AMDGPU::V_ADD_U16_e64:
381   case AMDGPU::V_ADD_U16_e32:
382   case AMDGPU::V_SUB_U16_e64:
383   case AMDGPU::V_SUB_U16_e32:
384   case AMDGPU::V_SUBREV_U16_e64:
385   case AMDGPU::V_SUBREV_U16_e32:
386   case AMDGPU::V_MUL_LO_U16_e64:
387   case AMDGPU::V_MUL_LO_U16_e32:
388   case AMDGPU::V_ADD_F16_e64:
389   case AMDGPU::V_ADD_F16_e32:
390   case AMDGPU::V_SUB_F16_e64:
391   case AMDGPU::V_SUB_F16_e32:
392   case AMDGPU::V_SUBREV_F16_e64:
393   case AMDGPU::V_SUBREV_F16_e32:
394   case AMDGPU::V_MUL_F16_e64:
395   case AMDGPU::V_MUL_F16_e32:
396   case AMDGPU::V_MAX_F16_e64:
397   case AMDGPU::V_MAX_F16_e32:
398   case AMDGPU::V_MIN_F16_e64:
399   case AMDGPU::V_MIN_F16_e32:
400   case AMDGPU::V_MAX_U16_e64:
401   case AMDGPU::V_MAX_U16_e32:
402   case AMDGPU::V_MIN_U16_e64:
403   case AMDGPU::V_MIN_U16_e32:
404   case AMDGPU::V_MAX_I16_e64:
405   case AMDGPU::V_MAX_I16_e32:
406   case AMDGPU::V_MIN_I16_e64:
407   case AMDGPU::V_MIN_I16_e32:
408   case AMDGPU::V_MAD_F16_e64:
409   case AMDGPU::V_MAD_U16_e64:
410   case AMDGPU::V_MAD_I16_e64:
411   case AMDGPU::V_FMA_F16_e64:
412   case AMDGPU::V_DIV_FIXUP_F16_e64:
413     // On gfx10, all 16-bit instructions preserve the high bits.
414     return getGeneration() <= AMDGPUSubtarget::GFX9;
415   case AMDGPU::V_MADAK_F16:
416   case AMDGPU::V_MADMK_F16:
417   case AMDGPU::V_MAC_F16_e64:
418   case AMDGPU::V_MAC_F16_e32:
419   case AMDGPU::V_FMAMK_F16:
420   case AMDGPU::V_FMAAK_F16:
421   case AMDGPU::V_FMAC_F16_e64:
422   case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them. Some
    // instructions changed to preserving the high bits.
426     return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
427   case AMDGPU::V_MAD_MIXLO_F16:
428   case AMDGPU::V_MAD_MIXHI_F16:
429   default:
430     return false;
431   }
432 }
433 
434 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
435   const Function &F) const {
436   if (NWaves == 1)
437     return getLocalMemorySize();
438   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
439   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
440   if (!WorkGroupsPerCu)
441     return 0;
442   unsigned MaxWaves = getMaxWavesPerEU();
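  // A hypothetical worked example of the computation below: with 65536 bytes
  // of LDS, MaxWaves = 10, WorkGroupsPerCu = 8 and NWaves = 5, a work group
  // may use 65536 * 10 / 8 / 5 = 16384 bytes.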
443   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
444 }
445 
446 // FIXME: Should return min,max range.
447 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
448   const Function &F) const {
449   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
450   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
451   if (!MaxWorkGroupsPerCu)
452     return 0;
453 
454   const unsigned WaveSize = getWavefrontSize();
455 
  // FIXME: Do we need to account for the alignment requirement of LDS by
  // rounding the size up?
  // Compute the restriction based on LDS usage.
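  // Hypothetical example: 65536 bytes of LDS with Bytes = 16384 gives
  // NumGroups = 4 concurrent work groups.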
459   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
460 
461   // This can be queried with more LDS than is possible, so just assume the
462   // worst.
463   if (NumGroups == 0)
464     return 1;
465 
466   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
467 
468   // Round to the number of waves.
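  // The division below rounds up: a partially filled final wave still occupies
  // a full wave slot.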
469   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
470   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
471 
472   // Clamp to the maximum possible number of waves.
473   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
474 
475   // FIXME: Needs to be a multiple of the group size?
476   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
477 
478   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
479          "computed invalid occupancy");
480   return MaxWaves;
481 }
482 
483 unsigned
484 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
485   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
486   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
487 }
488 
489 std::pair<unsigned, unsigned>
490 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
491   switch (CC) {
492   case CallingConv::AMDGPU_VS:
493   case CallingConv::AMDGPU_LS:
494   case CallingConv::AMDGPU_HS:
495   case CallingConv::AMDGPU_ES:
496   case CallingConv::AMDGPU_GS:
497   case CallingConv::AMDGPU_PS:
498     return std::make_pair(1, getWavefrontSize());
499   default:
500     return std::make_pair(1u, getMaxFlatWorkGroupSize());
501   }
502 }
503 
504 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
505   const Function &F) const {
506   // Default minimum/maximum flat work group sizes.
507   std::pair<unsigned, unsigned> Default =
508     getDefaultFlatWorkGroupSize(F.getCallingConv());
509 
510   // Requested minimum/maximum flat work group sizes.
511   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
512     F, "amdgpu-flat-work-group-size", Default);
513 
  // Make sure the requested minimum does not exceed the requested maximum.
515   if (Requested.first > Requested.second)
516     return Default;
517 
518   // Make sure requested values do not violate subtarget's specifications.
519   if (Requested.first < getMinFlatWorkGroupSize())
520     return Default;
521   if (Requested.second > getMaxFlatWorkGroupSize())
522     return Default;
523 
524   return Requested;
525 }
526 
527 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
528     const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
529   // Default minimum/maximum number of waves per execution unit.
530   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
531 
532   // If minimum/maximum flat work group sizes were explicitly requested using
533   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
534   // number of waves per execution unit to values implied by requested
535   // minimum/maximum flat work group sizes.
536   unsigned MinImpliedByFlatWorkGroupSize =
537     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
538   Default.first = MinImpliedByFlatWorkGroupSize;
539 
540   // Requested minimum/maximum number of waves per execution unit.
541   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
542     F, "amdgpu-waves-per-eu", Default, true);
543 
  // Make sure the requested minimum does not exceed the requested maximum.
545   if (Requested.second && Requested.first > Requested.second)
546     return Default;
547 
548   // Make sure requested values do not violate subtarget's specifications.
549   if (Requested.first < getMinWavesPerEU() ||
550       Requested.second > getMaxWavesPerEU())
551     return Default;
552 
553   // Make sure requested values are compatible with values implied by requested
554   // minimum/maximum flat work group sizes.
555   if (Requested.first < MinImpliedByFlatWorkGroupSize)
556     return Default;
557 
558   return Requested;
559 }
560 
561 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
562   auto Node = Kernel.getMetadata("reqd_work_group_size");
563   if (Node && Node->getNumOperands() == 3)
564     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
565   return std::numeric_limits<unsigned>::max();
566 }
567 
568 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
569   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
570 }
571 
572 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
573                                            unsigned Dimension) const {
574   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
575   if (ReqdSize != std::numeric_limits<unsigned>::max())
576     return ReqdSize - 1;
577   return getFlatWorkGroupSizes(Kernel).second - 1;
578 }
579 
580 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
581   Function *Kernel = I->getParent()->getParent();
582   unsigned MinSize = 0;
583   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
584   bool IdQuery = false;
585 
  // If reqd_work_group_size is present, it narrows the value down.
587   if (auto *CI = dyn_cast<CallInst>(I)) {
588     const Function *F = CI->getCalledFunction();
589     if (F) {
590       unsigned Dim = UINT_MAX;
591       switch (F->getIntrinsicID()) {
592       case Intrinsic::amdgcn_workitem_id_x:
593       case Intrinsic::r600_read_tidig_x:
594         IdQuery = true;
595         LLVM_FALLTHROUGH;
596       case Intrinsic::r600_read_local_size_x:
597         Dim = 0;
598         break;
599       case Intrinsic::amdgcn_workitem_id_y:
600       case Intrinsic::r600_read_tidig_y:
601         IdQuery = true;
602         LLVM_FALLTHROUGH;
603       case Intrinsic::r600_read_local_size_y:
604         Dim = 1;
605         break;
606       case Intrinsic::amdgcn_workitem_id_z:
607       case Intrinsic::r600_read_tidig_z:
608         IdQuery = true;
609         LLVM_FALLTHROUGH;
610       case Intrinsic::r600_read_local_size_z:
611         Dim = 2;
612         break;
613       default:
614         break;
615       }
616 
617       if (Dim <= 3) {
618         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
619         if (ReqdSize != std::numeric_limits<unsigned>::max())
620           MinSize = MaxSize = ReqdSize;
621       }
622     }
623   }
624 
625   if (!MaxSize)
626     return false;
627 
  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
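  // Hypothetical example: with a required work group size of 64, an ID query
  // gets the range [0, 64) and a size query gets [64, 65).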
630   if (IdQuery)
631     MinSize = 0;
632   else
633     ++MaxSize;
634 
635   MDBuilder MDB(I->getContext());
636   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
637                                                   APInt(32, MaxSize));
638   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
639   return true;
640 }
641 
642 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
643   assert(AMDGPU::isKernel(F.getCallingConv()));
644 
645   // We don't allocate the segment if we know the implicit arguments weren't
646   // used, even if the ABI implies we need them.
647   if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
648     return 0;
649 
650   if (isMesaKernel(F))
651     return 16;
652 
653   // Assume all implicit inputs are used by default
654   return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
655 }
656 
657 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
658                                                  Align &MaxAlign) const {
659   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
660          F.getCallingConv() == CallingConv::SPIR_KERNEL);
661 
662   const DataLayout &DL = F.getParent()->getDataLayout();
663   uint64_t ExplicitArgBytes = 0;
664   MaxAlign = Align(1);
665 
666   for (const Argument &Arg : F.args()) {
667     const bool IsByRef = Arg.hasByRefAttr();
668     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
669     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
670     if (!Alignment)
671       Alignment = DL.getABITypeAlign(ArgTy);
672 
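    // Each argument starts at the next offset aligned for its (byref) type.
    // Hypothetical example: an i32 leaves ExplicitArgBytes = 4; a following
    // double (8-byte size and alignment) then starts at offset 8 and advances
    // ExplicitArgBytes to 16.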
673     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
674     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
675     MaxAlign = max(MaxAlign, Alignment);
676   }
677 
678   return ExplicitArgBytes;
679 }
680 
681 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
682                                                 Align &MaxAlign) const {
683   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
684 
685   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
686 
687   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
688   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
689   if (ImplicitBytes != 0) {
690     const Align Alignment = getAlignmentForImplicitArgPtr();
691     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
692     MaxAlign = std::max(MaxAlign, Alignment);
693   }
694 
695   // Being able to dereference past the end is useful for emitting scalar loads.
696   return alignTo(TotalSize, 4);
697 }
698 
699 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
700   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
701                                   : AMDGPUDwarfFlavour::Wave64;
702 }
703 
704 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
705                                       unsigned NumRegionInstrs) const {
706   // Track register pressure so the scheduler can try to decrease
707   // pressure once register usage is above the threshold defined by
708   // SIRegisterInfo::getRegPressureSetLimit()
709   Policy.ShouldTrackPressure = true;
710 
  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
713   Policy.OnlyTopDown = false;
714   Policy.OnlyBottomUp = false;
715 
716   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
717   if (!enableSIScheduler())
718     Policy.ShouldTrackLaneMasks = true;
719 }
720 
721 bool GCNSubtarget::hasMadF16() const {
722   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
723 }
724 
725 bool GCNSubtarget::useVGPRIndexMode() const {
726   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
727 }
728 
729 bool GCNSubtarget::useAA() const { return UseAA; }
730 
731 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
732   if (getGeneration() >= AMDGPUSubtarget::GFX10)
733     return getMaxWavesPerEU();
734 
735   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
736     if (SGPRs <= 80)
737       return 10;
738     if (SGPRs <= 88)
739       return 9;
740     if (SGPRs <= 100)
741       return 8;
742     return 7;
743   }
744   if (SGPRs <= 48)
745     return 10;
746   if (SGPRs <= 56)
747     return 9;
748   if (SGPRs <= 64)
749     return 8;
750   if (SGPRs <= 72)
751     return 7;
752   if (SGPRs <= 80)
753     return 6;
754   return 5;
755 }
756 
757 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
758   unsigned MaxWaves = getMaxWavesPerEU();
759   unsigned Granule = getVGPRAllocGranule();
760   if (VGPRs < Granule)
761     return MaxWaves;
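  // Round the VGPR count up to the allocation granule; occupancy is then how
  // many such allocations fit in the register file, clamped to MaxWaves.
  // Hypothetical example: with a granule of 8, 70 VGPRs round up to 72, and a
  // 256-register file then supports 256 / 72 = 3 waves.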
762   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
763   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
764 }
765 
766 unsigned
767 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
768   if (getGeneration() >= AMDGPUSubtarget::GFX10)
769     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
770 
771   if (HasFlatScratch || HasArchitectedFlatScratch) {
772     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
773       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
774     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
775       return 4; // FLAT_SCRATCH, VCC (in that order).
776   }
777 
778   if (isXNACKEnabled())
779     return 4; // XNACK, VCC (in that order).
780   return 2; // VCC.
781 }
782 
783 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
784   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
785   return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
786 }
787 
788 unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now, assume it is needed if we have flat instructions.
792   const bool KernelUsesFlatScratch = hasFlatAddressSpace();
793   return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
794 }
795 
796 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
797                                         unsigned NumSGPRs,
798                                         unsigned NumVGPRs) const {
799   unsigned Occupancy =
800     std::min(getMaxWavesPerEU(),
801              getOccupancyWithLocalMemSize(LDSSize, F));
802   if (NumSGPRs)
803     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
804   if (NumVGPRs)
805     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
806   return Occupancy;
807 }
808 
809 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
810     const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
811     unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs this function can use, using the
  // default/requested minimum number of waves per execution unit.
814   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
815   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
816 
817   // Check if maximum number of SGPRs was explicitly requested using
818   // "amdgpu-num-sgpr" attribute.
819   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
820     unsigned Requested = AMDGPU::getIntegerAttribute(
821       F, "amdgpu-num-sgpr", MaxNumSGPRs);
822 
823     // Make sure requested value does not violate subtarget's specifications.
824     if (Requested && (Requested <= ReservedNumSGPRs))
825       Requested = 0;
826 
827     // If more SGPRs are required to support the input user/system SGPRs,
828     // increase to accommodate them.
829     //
830     // FIXME: This really ends up using the requested number of SGPRs + number
831     // of reserved special registers in total. Theoretically you could re-use
832     // the last input registers for these special registers, but this would
833     // require a lot of complexity to deal with the weird aliasing.
834     unsigned InputNumSGPRs = PreloadedSGPRs;
835     if (Requested && Requested < InputNumSGPRs)
836       Requested = InputNumSGPRs;
837 
838     // Make sure requested value is compatible with values implied by
839     // default/requested minimum/maximum number of waves per execution unit.
840     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
841       Requested = 0;
842     if (WavesPerEU.second &&
843         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
844       Requested = 0;
845 
846     if (Requested)
847       MaxNumSGPRs = Requested;
848   }
849 
850   if (hasSGPRInitBug())
851     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
852 
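  // The reserved SGPRs come out of the per-wave budget, but the result can
  // never exceed the architecturally addressable SGPR count.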
853   return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
854 }
855 
856 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
857   const Function &F = MF.getFunction();
858   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
859   return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
860                             getReservedNumSGPRs(MF));
861 }
862 
863 static unsigned getMaxNumPreloadedSGPRs() {
864   // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // Private segment buffer
                          2 + // Dispatch ptr
                          2 + // Queue ptr
                          2 + // Kernel segment ptr
                          2 + // Dispatch ID
                          2 + // Flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // Private segment wave byte offset
878   return MaxUserSGPRs + MaxSystemSGPRs;
879 }
880 
881 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
882   return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
883                             getReservedNumSGPRs(F));
884 }
885 
886 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
887     const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs this function can use, using the
  // default/requested minimum number of waves per execution unit.
890   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
891 
892   // Check if maximum number of VGPRs was explicitly requested using
893   // "amdgpu-num-vgpr" attribute.
894   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
895     unsigned Requested = AMDGPU::getIntegerAttribute(
896       F, "amdgpu-num-vgpr", MaxNumVGPRs);
897 
898     if (hasGFX90AInsts())
899       Requested *= 2;
900 
901     // Make sure requested value is compatible with values implied by
902     // default/requested minimum/maximum number of waves per execution unit.
903     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
904       Requested = 0;
905     if (WavesPerEU.second &&
906         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
907       Requested = 0;
908 
909     if (Requested)
910       MaxNumVGPRs = Requested;
911   }
912 
913   return MaxNumVGPRs;
914 }
915 
916 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
917   return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
918 }
919 
920 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
921   const Function &F = MF.getFunction();
922   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
923   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
924 }
925 
926 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
927                                          int UseOpIdx, SDep &Dep) const {
928   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
929       !Def->isInstr() || !Use->isInstr())
930     return;
931 
932   MachineInstr *DefI = Def->getInstr();
933   MachineInstr *UseI = Use->getInstr();
934 
935   if (DefI->isBundle()) {
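    // The register may be written by any instruction in the bundle. Use the
    // latency of the last bundled instruction that writes it, reduced by one
    // for each bundled instruction issued after that writer.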
936     const SIRegisterInfo *TRI = getRegisterInfo();
937     auto Reg = Dep.getReg();
938     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
939     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
940     unsigned Lat = 0;
941     for (++I; I != E && I->isBundledWithPred(); ++I) {
942       if (I->modifiesRegister(Reg, TRI))
943         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
944       else if (Lat)
945         --Lat;
946     }
947     Dep.setLatency(Lat);
948   } else if (UseI->isBundle()) {
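    // Symmetric case: the use is a bundle. Charge the def's full latency,
    // reduced by one for each bundled instruction issued before the first
    // reader of the register.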
949     const SIRegisterInfo *TRI = getRegisterInfo();
950     auto Reg = Dep.getReg();
951     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
952     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
953     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
954     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
955       if (I->readsRegister(Reg, TRI))
956         break;
957       --Lat;
958     }
959     Dep.setLatency(Lat);
960   } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
961     // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
962     // implicit operands which come from the MCInstrDesc, which can fool
963     // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
964     // pseudo operands.
965     Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
966         DefI, DefOpIdx, UseI, UseOpIdx));
967   }
968 }
969 
970 namespace {
971 struct FillMFMAShadowMutation : ScheduleDAGMutation {
972   const SIInstrInfo *TII;
973 
974   ScheduleDAGMI *DAG;
975 
976   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
977 
978   bool isSALU(const SUnit *SU) const {
979     const MachineInstr *MI = SU->getInstr();
980     return MI && TII->isSALU(*MI) && !MI->isTerminator();
981   }
982 
983   bool isVALU(const SUnit *SU) const {
984     const MachineInstr *MI = SU->getInstr();
985     return MI && TII->isVALU(*MI);
986   }
987 
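  // Adding an artificial edge Pred -> Succ must not create a cycle, i.e. Pred
  // must not already be (transitively) reachable from Succ.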
988   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
989     if (Pred->NodeNum < Succ->NodeNum)
990       return true;
991 
992     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
993 
994     for (unsigned I = 0; I < Succs.size(); ++I) {
995       for (const SDep &SI : Succs[I]->Succs) {
996         const SUnit *SU = SI.getSUnit();
997         if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
998           Succs.push_back(SU);
999       }
1000     }
1001 
1002     SmallPtrSet<const SUnit*, 32> Visited;
1003     while (!Preds.empty()) {
1004       const SUnit *SU = Preds.pop_back_val();
1005       if (llvm::is_contained(Succs, SU))
1006         return false;
1007       Visited.insert(SU);
1008       for (const SDep &SI : SU->Preds)
1009         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
1010           Preds.push_back(SI.getSUnit());
1011     }
1012 
1013     return true;
1014   }
1015 
  // Link as many SALU instructions in a chain as possible. Return the size of
  // the chain. Links up to MaxChain instructions.
1018   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
1019                          SmallPtrSetImpl<SUnit *> &Visited) const {
1020     SmallVector<SUnit *, 8> Worklist({To});
1021     unsigned Linked = 0;
1022 
1023     while (!Worklist.empty() && MaxChain-- > 0) {
1024       SUnit *SU = Worklist.pop_back_val();
1025       if (!Visited.insert(SU).second)
1026         continue;
1027 
      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
1029                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
1030 
1031       if (SU->addPred(SDep(From, SDep::Artificial), false))
1032         ++Linked;
1033 
1034       for (SDep &SI : From->Succs) {
1035         SUnit *SUv = SI.getSUnit();
1036         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
1037           SUv->addPred(SDep(SU, SDep::Artificial), false);
1038       }
1039 
1040       for (SDep &SI : SU->Succs) {
1041         SUnit *Succ = SI.getSUnit();
1042         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
1043           Worklist.push_back(Succ);
1044       }
1045     }
1046 
1047     return Linked;
1048   }
1049 
1050   void apply(ScheduleDAGInstrs *DAGInstrs) override {
1051     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1052     if (!ST.hasMAIInsts() || DisablePowerSched)
1053       return;
1054     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1055     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1056     if (!TSchedModel || DAG->SUnits.empty())
1057       return;
1058 
    // Scan for MFMA long-latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. That is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
1063     auto LastSALU = DAG->SUnits.begin();
1064     auto E = DAG->SUnits.end();
1065     SmallPtrSet<SUnit*, 32> Visited;
1066     for (SUnit &SU : DAG->SUnits) {
1067       MachineInstr &MAI = *SU.getInstr();
1068       if (!TII->isMAI(MAI) ||
1069            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
1070            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
1071         continue;
1072 
1073       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
1074 
1075       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
1076                  dbgs() << "Need " << Lat
1077                         << " instructions to cover latency.\n");
1078 
1079       // Find up to Lat independent scalar instructions as early as
1080       // possible such that they can be scheduled after this MFMA.
1081       for ( ; Lat && LastSALU != E; ++LastSALU) {
1082         if (Visited.count(&*LastSALU))
1083           continue;
1084 
1085         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
1086           continue;
1087 
1088         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1089       }
1090     }
1091   }
1092 };
1093 } // namespace
1094 
1095 void GCNSubtarget::getPostRAMutations(
1096     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1097   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1098 }
1099 
1100 std::unique_ptr<ScheduleDAGMutation>
1101 GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
1102   return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
1103 }
1104 
1105 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1106   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1107     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1108   else
1109     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1110 }
1111 
1112 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
1113   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1114     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1115   else
1116     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
1117 }
1118