1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Implements the AMDGPU-specific subclass of TargetSubtargetInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPUInstructionSelector.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "R600Subtarget.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #undef AMDGPUSubtarget
42 
43 static cl::opt<bool> DisablePowerSched(
44   "amdgpu-disable-power-sched",
45   cl::desc("Disable scheduling to minimize mAI power bursts"),
46   cl::init(false));
47 
48 static cl::opt<bool> EnableVGPRIndexMode(
49   "amdgpu-vgpr-index-mode",
50   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51   cl::init(false));
52 
53 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
54                            cl::desc("Enable the use of AA during codegen."),
55                            cl::init(true));
56 
57 GCNSubtarget::~GCNSubtarget() = default;
58 
59 GCNSubtarget &
60 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
61                                               StringRef GPU, StringRef FS) {
62   // Determine default and user-specified characteristics
63   //
64   // We want to be able to turn these off, but making this a subtarget feature
65   // for SI has the unhelpful behavior that it unsets everything else if you
66   // disable it.
67   //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
70 
71   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
72 
  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
74   if (isAmdHsaOS())
75     FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
76 
77   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
78 
79   // Disable mutually exclusive bits.
80   if (FS.contains_insensitive("+wavefrontsize")) {
81     if (!FS.contains_insensitive("wavefrontsize16"))
82       FullFS += "-wavefrontsize16,";
83     if (!FS.contains_insensitive("wavefrontsize32"))
84       FullFS += "-wavefrontsize32,";
85     if (!FS.contains_insensitive("wavefrontsize64"))
86       FullFS += "-wavefrontsize64,";
87   }
88 
89   FullFS += FS;
90 
91   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
92 
  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
97   if (Gen == AMDGPUSubtarget::INVALID) {
98      Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
99                                         : AMDGPUSubtarget::SOUTHERN_ISLANDS;
100   }
101 
  // We don't support FP64 for EG/NI at the moment.
103   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
104 
  // Targets must support 64-bit offsets for MUBUF instructions, flat
  // operations, or both; otherwise they cannot access a 64-bit global address
  // space.
108   assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
113   if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
114     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
115     FlatForGlobal = true;
116   }
117   // Unless +-flat-for-global is specified, use MUBUF instructions for global
118   // address space access if flat operations are not available.
119   if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
120     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
121     FlatForGlobal = false;
122   }
123 
124   // Set defaults if needed.
125   if (MaxPrivateElementSize == 0)
126     MaxPrivateElementSize = 4;
127 
128   if (LDSBankCount == 0)
129     LDSBankCount = 32;
130 
131   if (TT.getArch() == Triple::amdgcn) {
132     if (LocalMemorySize == 0)
133       LocalMemorySize = 32768;
134 
    // Do something sensible for an unspecified target.
136     if (!HasMovrel && !HasVGPRIndexMode)
137       HasMovrel = true;
138   }
139 
140   // Don't crash on invalid devices.
141   if (WavefrontSizeLog2 == 0)
142     WavefrontSizeLog2 = 5;
143 
144   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
145   HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
146 
147   TargetID.setTargetIDFromFeaturesString(FS);
148 
149   LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
150                     << TargetID.getXnackSetting() << '\n');
151   LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
152                     << TargetID.getSramEccSetting() << '\n');
153 
154   return *this;
155 }
156 
157 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
158   TargetTriple(TT),
159   GCN3Encoding(false),
160   Has16BitInsts(false),
161   HasMadMixInsts(false),
162   HasMadMacF32Insts(false),
163   HasDsSrc2Insts(false),
164   HasSDWA(false),
165   HasVOP3PInsts(false),
166   HasMulI24(true),
167   HasMulU24(true),
168   HasSMulHi(false),
169   HasInv2PiInlineImm(false),
170   HasFminFmaxLegacy(true),
171   EnablePromoteAlloca(false),
172   HasTrigReducedRange(false),
173   MaxWavesPerEU(10),
174   LocalMemorySize(0),
175   WavefrontSizeLog2(0)
176   { }
177 
178 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
179                            const GCNTargetMachine &TM)
180     : // clang-format off
181     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
182     AMDGPUSubtarget(TT),
183     TargetTriple(TT),
184     TargetID(*this),
185     Gen(INVALID),
186     InstrItins(getInstrItineraryForCPU(GPU)),
187     LDSBankCount(0),
188     MaxPrivateElementSize(0),
189 
190     FastFMAF32(false),
191     FastDenormalF32(false),
192     HalfRate64Ops(false),
193     FullRate64Ops(false),
194 
195     FlatForGlobal(false),
196     AutoWaitcntBeforeBarrier(false),
197     BackOffBarrier(false),
198     UnalignedScratchAccess(false),
199     UnalignedAccessMode(false),
200 
201     HasApertureRegs(false),
202     SupportsXNACK(false),
203     EnableXNACK(false),
204     EnableTgSplit(false),
205     EnableCuMode(false),
206     TrapHandler(false),
207 
208     EnableLoadStoreOpt(false),
209     EnableUnsafeDSOffsetFolding(false),
210     EnableSIScheduler(false),
211     EnableDS128(false),
212     EnablePRTStrictNull(false),
213     DumpCode(false),
214 
215     FP64(false),
216     CIInsts(false),
217     GFX8Insts(false),
218     GFX9Insts(false),
219     GFX90AInsts(false),
220     GFX940Insts(false),
221     GFX10Insts(false),
222     GFX10_3Insts(false),
223     GFX7GFX8GFX9Insts(false),
224     SGPRInitBug(false),
225     NegativeScratchOffsetBug(false),
226     NegativeUnalignedScratchOffsetBug(false),
227     HasSMemRealTime(false),
228     HasIntClamp(false),
229     HasFmaMixInsts(false),
230     HasMovrel(false),
231     HasVGPRIndexMode(false),
232     HasScalarStores(false),
233     HasScalarAtomics(false),
234     HasSDWAOmod(false),
235     HasSDWAScalar(false),
236     HasSDWASdst(false),
237     HasSDWAMac(false),
238     HasSDWAOutModsVOPC(false),
239     HasDPP(false),
240     HasDPP8(false),
241     Has64BitDPP(false),
242     HasPackedFP32Ops(false),
243     HasImageInsts(false),
244     HasExtendedImageInsts(false),
245     HasR128A16(false),
246     HasGFX10A16(false),
247     HasG16(false),
248     HasNSAEncoding(false),
249     NSAMaxSize(0),
250     GFX10_AEncoding(false),
251     GFX10_BEncoding(false),
252     HasDLInsts(false),
253     HasDot1Insts(false),
254     HasDot2Insts(false),
255     HasDot3Insts(false),
256     HasDot4Insts(false),
257     HasDot5Insts(false),
258     HasDot6Insts(false),
259     HasDot7Insts(false),
260     HasMAIInsts(false),
261     HasPkFmacF16Inst(false),
262     HasAtomicFaddInsts(false),
263     SupportsSRAMECC(false),
264     EnableSRAMECC(false),
265     HasNoSdstCMPX(false),
266     HasVscnt(false),
267     HasGetWaveIdInst(false),
268     HasSMemTimeInst(false),
269     HasShaderCyclesRegister(false),
270     HasVOP3Literal(false),
271     HasNoDataDepHazard(false),
272     FlatAddressSpace(false),
273     FlatInstOffsets(false),
274     FlatGlobalInsts(false),
275     FlatScratchInsts(false),
276     ScalarFlatScratchInsts(false),
277     HasArchitectedFlatScratch(false),
278     EnableFlatScratch(false),
279     AddNoCarryInsts(false),
280     HasUnpackedD16VMem(false),
281     LDSMisalignedBug(false),
282     HasMFMAInlineLiteralBug(false),
283     UnalignedBufferAccess(false),
284     UnalignedDSAccess(false),
285     HasPackedTID(false),
286 
287     ScalarizeGlobal(false),
288 
289     HasVcmpxPermlaneHazard(false),
290     HasVMEMtoScalarWriteHazard(false),
291     HasSMEMtoVectorWriteHazard(false),
292     HasInstFwdPrefetchBug(false),
293     HasVcmpxExecWARHazard(false),
294     HasLdsBranchVmemWARHazard(false),
295     HasNSAtoVMEMBug(false),
296     HasNSAClauseBug(false),
297     HasOffset3fBug(false),
298     HasFlatSegmentOffsetBug(false),
299     HasImageStoreD16Bug(false),
300     HasImageGather4D16Bug(false),
301 
302     FeatureDisable(false),
303     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
304     TLInfo(TM, *this),
305     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
306   // clang-format on
307   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
308   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
309   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
310   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
311   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
314 }
315 
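// Return how many of an instruction's operands may read the scalar constant
// bus. Before GFX10 the limit is one; GFX10 raises it to two, except for the
// 64-bit shifts listed below, which remain limited to one.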
316 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
317   if (getGeneration() < GFX10)
318     return 1;
319 
320   switch (Opcode) {
321   case AMDGPU::V_LSHLREV_B64_e64:
322   case AMDGPU::V_LSHLREV_B64_gfx10:
323   case AMDGPU::V_LSHL_B64_e64:
324   case AMDGPU::V_LSHRREV_B64_e64:
325   case AMDGPU::V_LSHRREV_B64_gfx10:
326   case AMDGPU::V_LSHR_B64_e64:
327   case AMDGPU::V_ASHRREV_I64_e64:
328   case AMDGPU::V_ASHRREV_I64_gfx10:
329   case AMDGPU::V_ASHR_I64_e64:
330     return 1;
331   }
332 
333   return 2;
334 }
335 
336 /// This list was mostly derived from experimentation.
337 bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
338   switch (Opcode) {
339   case AMDGPU::V_CVT_F16_F32_e32:
340   case AMDGPU::V_CVT_F16_F32_e64:
341   case AMDGPU::V_CVT_F16_U16_e32:
342   case AMDGPU::V_CVT_F16_U16_e64:
343   case AMDGPU::V_CVT_F16_I16_e32:
344   case AMDGPU::V_CVT_F16_I16_e64:
345   case AMDGPU::V_RCP_F16_e64:
346   case AMDGPU::V_RCP_F16_e32:
347   case AMDGPU::V_RSQ_F16_e64:
348   case AMDGPU::V_RSQ_F16_e32:
349   case AMDGPU::V_SQRT_F16_e64:
350   case AMDGPU::V_SQRT_F16_e32:
351   case AMDGPU::V_LOG_F16_e64:
352   case AMDGPU::V_LOG_F16_e32:
353   case AMDGPU::V_EXP_F16_e64:
354   case AMDGPU::V_EXP_F16_e32:
355   case AMDGPU::V_SIN_F16_e64:
356   case AMDGPU::V_SIN_F16_e32:
357   case AMDGPU::V_COS_F16_e64:
358   case AMDGPU::V_COS_F16_e32:
359   case AMDGPU::V_FLOOR_F16_e64:
360   case AMDGPU::V_FLOOR_F16_e32:
361   case AMDGPU::V_CEIL_F16_e64:
362   case AMDGPU::V_CEIL_F16_e32:
363   case AMDGPU::V_TRUNC_F16_e64:
364   case AMDGPU::V_TRUNC_F16_e32:
365   case AMDGPU::V_RNDNE_F16_e64:
366   case AMDGPU::V_RNDNE_F16_e32:
367   case AMDGPU::V_FRACT_F16_e64:
368   case AMDGPU::V_FRACT_F16_e32:
369   case AMDGPU::V_FREXP_MANT_F16_e64:
370   case AMDGPU::V_FREXP_MANT_F16_e32:
371   case AMDGPU::V_FREXP_EXP_I16_F16_e64:
372   case AMDGPU::V_FREXP_EXP_I16_F16_e32:
373   case AMDGPU::V_LDEXP_F16_e64:
374   case AMDGPU::V_LDEXP_F16_e32:
375   case AMDGPU::V_LSHLREV_B16_e64:
376   case AMDGPU::V_LSHLREV_B16_e32:
377   case AMDGPU::V_LSHRREV_B16_e64:
378   case AMDGPU::V_LSHRREV_B16_e32:
379   case AMDGPU::V_ASHRREV_I16_e64:
380   case AMDGPU::V_ASHRREV_I16_e32:
381   case AMDGPU::V_ADD_U16_e64:
382   case AMDGPU::V_ADD_U16_e32:
383   case AMDGPU::V_SUB_U16_e64:
384   case AMDGPU::V_SUB_U16_e32:
385   case AMDGPU::V_SUBREV_U16_e64:
386   case AMDGPU::V_SUBREV_U16_e32:
387   case AMDGPU::V_MUL_LO_U16_e64:
388   case AMDGPU::V_MUL_LO_U16_e32:
389   case AMDGPU::V_ADD_F16_e64:
390   case AMDGPU::V_ADD_F16_e32:
391   case AMDGPU::V_SUB_F16_e64:
392   case AMDGPU::V_SUB_F16_e32:
393   case AMDGPU::V_SUBREV_F16_e64:
394   case AMDGPU::V_SUBREV_F16_e32:
395   case AMDGPU::V_MUL_F16_e64:
396   case AMDGPU::V_MUL_F16_e32:
397   case AMDGPU::V_MAX_F16_e64:
398   case AMDGPU::V_MAX_F16_e32:
399   case AMDGPU::V_MIN_F16_e64:
400   case AMDGPU::V_MIN_F16_e32:
401   case AMDGPU::V_MAX_U16_e64:
402   case AMDGPU::V_MAX_U16_e32:
403   case AMDGPU::V_MIN_U16_e64:
404   case AMDGPU::V_MIN_U16_e32:
405   case AMDGPU::V_MAX_I16_e64:
406   case AMDGPU::V_MAX_I16_e32:
407   case AMDGPU::V_MIN_I16_e64:
408   case AMDGPU::V_MIN_I16_e32:
409   case AMDGPU::V_MAD_F16_e64:
410   case AMDGPU::V_MAD_U16_e64:
411   case AMDGPU::V_MAD_I16_e64:
412   case AMDGPU::V_FMA_F16_e64:
413   case AMDGPU::V_DIV_FIXUP_F16_e64:
414     // On gfx10, all 16-bit instructions preserve the high bits.
415     return getGeneration() <= AMDGPUSubtarget::GFX9;
416   case AMDGPU::V_MADAK_F16:
417   case AMDGPU::V_MADMK_F16:
418   case AMDGPU::V_MAC_F16_e64:
419   case AMDGPU::V_MAC_F16_e32:
420   case AMDGPU::V_FMAMK_F16:
421   case AMDGPU::V_FMAAK_F16:
422   case AMDGPU::V_FMAC_F16_e64:
423   case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them. Some
    // instructions changed to preserving the high bits.
427     return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
428   case AMDGPU::V_MAD_MIXLO_F16:
429   case AMDGPU::V_MAD_MIXHI_F16:
430   default:
431     return false;
432   }
433 }
434 
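// Estimate the largest LDS allocation, in bytes, that still allows NWaves
// waves per execution unit, assuming work groups of the function's maximum
// flat work group size.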
435 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
436   const Function &F) const {
437   if (NWaves == 1)
438     return getLocalMemorySize();
439   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
440   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
441   if (!WorkGroupsPerCu)
442     return 0;
443   unsigned MaxWaves = getMaxWavesPerEU();
444   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
445 }
446 
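// Return the maximum occupancy (waves per EU) achievable with the given LDS
// usage in bytes. A hypothetical example: with 64 KiB of local memory, a
// wave64 kernel with a flat work group size of 256 that uses 32 KiB of LDS
// fits two work groups per CU (assuming the work-groups-per-CU limit allows
// it), i.e. 2 * ceil(256 / 64) = 8 waves, which is then clamped to the
// subtarget's maximum waves per EU.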
447 // FIXME: Should return min,max range.
448 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
449   const Function &F) const {
450   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
451   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
452   if (!MaxWorkGroupsPerCu)
453     return 0;
454 
455   const unsigned WaveSize = getWavefrontSize();
456 
  // FIXME: Do we need to account for the alignment requirement of LDS by
  // rounding the size up?
  // Compute the restriction based on LDS usage.
460   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
461 
462   // This can be queried with more LDS than is possible, so just assume the
463   // worst.
464   if (NumGroups == 0)
465     return 1;
466 
467   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
468 
469   // Round to the number of waves.
470   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
471   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
472 
473   // Clamp to the maximum possible number of waves.
474   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
475 
476   // FIXME: Needs to be a multiple of the group size?
477   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
478 
479   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
480          "computed invalid occupancy");
481   return MaxWaves;
482 }
483 
484 unsigned
485 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
486   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
487   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
488 }
489 
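// Graphics shader calling conventions are limited to a single wavefront per
// work group; everything else defaults to the full range up to the maximum
// flat work group size.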
490 std::pair<unsigned, unsigned>
491 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
492   switch (CC) {
493   case CallingConv::AMDGPU_VS:
494   case CallingConv::AMDGPU_LS:
495   case CallingConv::AMDGPU_HS:
496   case CallingConv::AMDGPU_ES:
497   case CallingConv::AMDGPU_GS:
498   case CallingConv::AMDGPU_PS:
499     return std::make_pair(1, getWavefrontSize());
500   default:
501     return std::make_pair(1u, getMaxFlatWorkGroupSize());
502   }
503 }
504 
505 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
506   const Function &F) const {
507   // Default minimum/maximum flat work group sizes.
508   std::pair<unsigned, unsigned> Default =
509     getDefaultFlatWorkGroupSize(F.getCallingConv());
510 
511   // Requested minimum/maximum flat work group sizes.
512   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
513     F, "amdgpu-flat-work-group-size", Default);
514 
  // Make sure the requested minimum does not exceed the requested maximum.
516   if (Requested.first > Requested.second)
517     return Default;
518 
519   // Make sure requested values do not violate subtarget's specifications.
520   if (Requested.first < getMinFlatWorkGroupSize())
521     return Default;
522   if (Requested.second > getMaxFlatWorkGroupSize())
523     return Default;
524 
525   return Requested;
526 }
527 
528 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
529     const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
530   // Default minimum/maximum number of waves per execution unit.
531   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
532 
533   // If minimum/maximum flat work group sizes were explicitly requested using
534   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
535   // number of waves per execution unit to values implied by requested
536   // minimum/maximum flat work group sizes.
537   unsigned MinImpliedByFlatWorkGroupSize =
538     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
539   Default.first = MinImpliedByFlatWorkGroupSize;
540 
541   // Requested minimum/maximum number of waves per execution unit.
542   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
543     F, "amdgpu-waves-per-eu", Default, true);
544 
  // Make sure the requested minimum does not exceed the requested maximum.
546   if (Requested.second && Requested.first > Requested.second)
547     return Default;
548 
549   // Make sure requested values do not violate subtarget's specifications.
550   if (Requested.first < getMinWavesPerEU() ||
551       Requested.second > getMaxWavesPerEU())
552     return Default;
553 
554   // Make sure requested values are compatible with values implied by requested
555   // minimum/maximum flat work group sizes.
556   if (Requested.first < MinImpliedByFlatWorkGroupSize)
557     return Default;
558 
559   return Requested;
560 }
561 
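// Return the reqd_work_group_size value for dimension Dim of Kernel, or
// UINT_MAX if the metadata is absent or malformed.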
562 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
563   auto Node = Kernel.getMetadata("reqd_work_group_size");
564   if (Node && Node->getNumOperands() == 3)
565     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
566   return std::numeric_limits<unsigned>::max();
567 }
568 
569 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
570   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
571 }
572 
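// Return the largest workitem ID the kernel can have in dimension Dimension:
// the reqd_work_group_size value minus one if present, otherwise the maximum
// flat work group size minus one.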
573 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
574                                            unsigned Dimension) const {
575   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
576   if (ReqdSize != std::numeric_limits<unsigned>::max())
577     return ReqdSize - 1;
578   return getFlatWorkGroupSizes(Kernel).second - 1;
579 }
580 
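// Attach !range metadata to a workitem id or local size intrinsic call,
// bounded by the kernel's flat work group size and narrowed by
// reqd_work_group_size when present. Return true if metadata was attached.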
581 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
582   Function *Kernel = I->getParent()->getParent();
583   unsigned MinSize = 0;
584   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
585   bool IdQuery = false;
586 
  // If reqd_work_group_size is present, it narrows the value down.
588   if (auto *CI = dyn_cast<CallInst>(I)) {
589     const Function *F = CI->getCalledFunction();
590     if (F) {
591       unsigned Dim = UINT_MAX;
592       switch (F->getIntrinsicID()) {
593       case Intrinsic::amdgcn_workitem_id_x:
594       case Intrinsic::r600_read_tidig_x:
595         IdQuery = true;
596         LLVM_FALLTHROUGH;
597       case Intrinsic::r600_read_local_size_x:
598         Dim = 0;
599         break;
600       case Intrinsic::amdgcn_workitem_id_y:
601       case Intrinsic::r600_read_tidig_y:
602         IdQuery = true;
603         LLVM_FALLTHROUGH;
604       case Intrinsic::r600_read_local_size_y:
605         Dim = 1;
606         break;
607       case Intrinsic::amdgcn_workitem_id_z:
608       case Intrinsic::r600_read_tidig_z:
609         IdQuery = true;
610         LLVM_FALLTHROUGH;
611       case Intrinsic::r600_read_local_size_z:
612         Dim = 2;
613         break;
614       default:
615         break;
616       }
617 
618       if (Dim <= 3) {
619         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
620         if (ReqdSize != std::numeric_limits<unsigned>::max())
621           MinSize = MaxSize = ReqdSize;
622       }
623     }
624   }
625 
626   if (!MaxSize)
627     return false;
628 
629   // Range metadata is [Lo, Hi). For ID query we need to pass max size
630   // as Hi. For size query we need to pass Hi + 1.
631   if (IdQuery)
632     MinSize = 0;
633   else
634     ++MaxSize;
635 
636   MDBuilder MDB(I->getContext());
637   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
638                                                   APInt(32, MaxSize));
639   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
640   return true;
641 }
642 
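// Return the number of bytes in the implicit kernel argument segment: zero if
// the kernel is known not to use the implicit argument pointer, 16 for Mesa
// kernels, and otherwise the value of "amdgpu-implicitarg-num-bytes" (56 by
// default).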
643 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
644   assert(AMDGPU::isKernel(F.getCallingConv()));
645 
646   // We don't allocate the segment if we know the implicit arguments weren't
647   // used, even if the ABI implies we need them.
648   if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
649     return 0;
650 
651   if (isMesaKernel(F))
652     return 16;
653 
654   // Assume all implicit inputs are used by default
655   return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
656 }
657 
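// Sum the aligned sizes of the explicit kernel arguments, honoring byref
// types and alignments where present, and report the largest argument
// alignment through MaxAlign.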
658 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
659                                                  Align &MaxAlign) const {
660   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
661          F.getCallingConv() == CallingConv::SPIR_KERNEL);
662 
663   const DataLayout &DL = F.getParent()->getDataLayout();
664   uint64_t ExplicitArgBytes = 0;
665   MaxAlign = Align(1);
666 
667   for (const Argument &Arg : F.args()) {
668     const bool IsByRef = Arg.hasByRefAttr();
669     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
670     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
671     if (!Alignment)
672       Alignment = DL.getABITypeAlign(ArgTy);
673 
674     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
675     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
676     MaxAlign = max(MaxAlign, Alignment);
677   }
678 
679   return ExplicitArgBytes;
680 }
681 
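// Return the size of the kernarg segment in bytes, covering the explicit
// kernel arguments and any implicit argument block, rounded up to a multiple
// of 4.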
682 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
683                                                 Align &MaxAlign) const {
684   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
685 
686   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
687 
688   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
689   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
690   if (ImplicitBytes != 0) {
691     const Align Alignment = getAlignmentForImplicitArgPtr();
692     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
693     MaxAlign = std::max(MaxAlign, Alignment);
694   }
695 
696   // Being able to dereference past the end is useful for emitting scalar loads.
697   return alignTo(TotalSize, 4);
698 }
699 
700 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
701   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
702                                   : AMDGPUDwarfFlavour::Wave64;
703 }
704 
705 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
706                                       unsigned NumRegionInstrs) const {
707   // Track register pressure so the scheduler can try to decrease
708   // pressure once register usage is above the threshold defined by
709   // SIRegisterInfo::getRegPressureSetLimit()
710   Policy.ShouldTrackPressure = true;
711 
712   // Enabling both top down and bottom up scheduling seems to give us less
713   // register spills than just using one of these approaches on its own.
714   Policy.OnlyTopDown = false;
715   Policy.OnlyBottomUp = false;
716 
717   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
718   if (!enableSIScheduler())
719     Policy.ShouldTrackLaneMasks = true;
720 }
721 
722 bool GCNSubtarget::hasMadF16() const {
723   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
724 }
725 
726 bool GCNSubtarget::useVGPRIndexMode() const {
727   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
728 }
729 
730 bool GCNSubtarget::useAA() const { return UseAA; }
731 
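// Return the maximum occupancy (waves per EU) achievable with the given
// per-wave SGPR count. On GFX10+ SGPRs no longer limit occupancy.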
732 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
733   if (getGeneration() >= AMDGPUSubtarget::GFX10)
734     return getMaxWavesPerEU();
735 
736   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
737     if (SGPRs <= 80)
738       return 10;
739     if (SGPRs <= 88)
740       return 9;
741     if (SGPRs <= 100)
742       return 8;
743     return 7;
744   }
745   if (SGPRs <= 48)
746     return 10;
747   if (SGPRs <= 56)
748     return 9;
749   if (SGPRs <= 64)
750     return 8;
751   if (SGPRs <= 72)
752     return 7;
753   if (SGPRs <= 80)
754     return 6;
755   return 5;
756 }
757 
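// Return the maximum occupancy achievable with the given per-wave VGPR count:
// round the count up to the allocation granule and divide the total VGPR
// budget by it, clamped to [1, max waves per EU].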
758 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
759   unsigned MaxWaves = getMaxWavesPerEU();
760   unsigned Granule = getVGPRAllocGranule();
761   if (VGPRs < Granule)
762     return MaxWaves;
763   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
764   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
765 }
766 
767 unsigned
768 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
769   if (getGeneration() >= AMDGPUSubtarget::GFX10)
770     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
771 
772   if (HasFlatScratch || HasArchitectedFlatScratch) {
773     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
774       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
775     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
776       return 4; // FLAT_SCRATCH, VCC (in that order).
777   }
778 
779   if (isXNACKEnabled())
780     return 4; // XNACK, VCC (in that order).
781   return 2; // VCC.
782 }
783 
784 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
785   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
786   return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
787 }
788 
789 unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
793   const bool KernelUsesFlatScratch = hasFlatAddressSpace();
794   return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
795 }
796 
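// Return the expected occupancy of a function given its LDS usage and, when
// nonzero, its SGPR and VGPR counts.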
797 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
798                                         unsigned NumSGPRs,
799                                         unsigned NumVGPRs) const {
800   unsigned Occupancy =
801     std::min(getMaxWavesPerEU(),
802              getOccupancyWithLocalMemSize(LDSSize, F));
803   if (NumSGPRs)
804     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
805   if (NumVGPRs)
806     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
807   return Occupancy;
808 }
809 
810 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
811     const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
812     unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
815   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
816   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
817 
818   // Check if maximum number of SGPRs was explicitly requested using
819   // "amdgpu-num-sgpr" attribute.
820   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
821     unsigned Requested = AMDGPU::getIntegerAttribute(
822       F, "amdgpu-num-sgpr", MaxNumSGPRs);
823 
824     // Make sure requested value does not violate subtarget's specifications.
825     if (Requested && (Requested <= ReservedNumSGPRs))
826       Requested = 0;
827 
828     // If more SGPRs are required to support the input user/system SGPRs,
829     // increase to accommodate them.
830     //
831     // FIXME: This really ends up using the requested number of SGPRs + number
832     // of reserved special registers in total. Theoretically you could re-use
833     // the last input registers for these special registers, but this would
834     // require a lot of complexity to deal with the weird aliasing.
835     unsigned InputNumSGPRs = PreloadedSGPRs;
836     if (Requested && Requested < InputNumSGPRs)
837       Requested = InputNumSGPRs;
838 
839     // Make sure requested value is compatible with values implied by
840     // default/requested minimum/maximum number of waves per execution unit.
841     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
842       Requested = 0;
843     if (WavesPerEU.second &&
844         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
845       Requested = 0;
846 
847     if (Requested)
848       MaxNumSGPRs = Requested;
849   }
850 
851   if (hasSGPRInitBug())
852     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
853 
854   return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
855 }
856 
857 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
858   const Function &F = MF.getFunction();
859   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
860   return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
861                             getReservedNumSGPRs(MF));
862 }
863 
864 static unsigned getMaxNumPreloadedSGPRs() {
865   // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // implicit buffer ptr
873   // Max number of system SGPRs
874   unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
875                             1 + // WorkGroupIDY
876                             1 + // WorkGroupIDZ
877                             1 + // WorkGroupInfo
878                             1;  // private segment wave byte offset
879   return MaxUserSGPRs + MaxSystemSGPRs;
880 }
881 
882 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
883   return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
884                             getReservedNumSGPRs(F));
885 }
886 
887 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
888     const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
891   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
892 
893   // Check if maximum number of VGPRs was explicitly requested using
894   // "amdgpu-num-vgpr" attribute.
895   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
896     unsigned Requested = AMDGPU::getIntegerAttribute(
897       F, "amdgpu-num-vgpr", MaxNumVGPRs);
898 
899     if (hasGFX90AInsts())
900       Requested *= 2;
901 
902     // Make sure requested value is compatible with values implied by
903     // default/requested minimum/maximum number of waves per execution unit.
904     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
905       Requested = 0;
906     if (WavesPerEU.second &&
907         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
908       Requested = 0;
909 
910     if (Requested)
911       MaxNumVGPRs = Requested;
912   }
913 
914   return MaxNumVGPRs;
915 }
916 
917 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
918   return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
919 }
920 
921 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
922   const Function &F = MF.getFunction();
923   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
924   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
925 }
926 
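// Refine the latency of data dependencies that involve bundles: for a def
// inside a bundle, use the latency of the bundled instruction that actually
// writes the register, reduced by the number of instructions issued after it
// in the bundle; for a use inside a bundle, reduce the def's latency by the
// number of bundled instructions issued before the reader. Zero-latency
// VCC_LO dependencies are recomputed to work around fixImplicitOperands (see
// the comment below).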
927 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
928                                          int UseOpIdx, SDep &Dep) const {
929   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
930       !Def->isInstr() || !Use->isInstr())
931     return;
932 
933   MachineInstr *DefI = Def->getInstr();
934   MachineInstr *UseI = Use->getInstr();
935 
936   if (DefI->isBundle()) {
937     const SIRegisterInfo *TRI = getRegisterInfo();
938     auto Reg = Dep.getReg();
939     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
940     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
941     unsigned Lat = 0;
942     for (++I; I != E && I->isBundledWithPred(); ++I) {
943       if (I->modifiesRegister(Reg, TRI))
944         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
945       else if (Lat)
946         --Lat;
947     }
948     Dep.setLatency(Lat);
949   } else if (UseI->isBundle()) {
950     const SIRegisterInfo *TRI = getRegisterInfo();
951     auto Reg = Dep.getReg();
952     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
953     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
954     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
955     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
956       if (I->readsRegister(Reg, TRI))
957         break;
958       --Lat;
959     }
960     Dep.setLatency(Lat);
961   } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
962     // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
963     // implicit operands which come from the MCInstrDesc, which can fool
964     // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
965     // pseudo operands.
966     Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
967         DefI, DefOpIdx, UseI, UseOpIdx));
968   }
969 }
970 
971 namespace {
972 struct FillMFMAShadowMutation : ScheduleDAGMutation {
973   const SIInstrInfo *TII;
974 
975   ScheduleDAGMI *DAG;
976 
977   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
978 
979   bool isSALU(const SUnit *SU) const {
980     const MachineInstr *MI = SU->getInstr();
981     return MI && TII->isSALU(*MI) && !MI->isTerminator();
982   }
983 
984   bool isVALU(const SUnit *SU) const {
985     const MachineInstr *MI = SU->getInstr();
986     return MI && TII->isVALU(*MI);
987   }
988 
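  // Return true if an artificial edge from Pred to Succ can be added without
  // creating a cycle, i.e. when there is no existing path from Succ to Pred.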
989   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
990     if (Pred->NodeNum < Succ->NodeNum)
991       return true;
992 
993     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
994 
995     for (unsigned I = 0; I < Succs.size(); ++I) {
996       for (const SDep &SI : Succs[I]->Succs) {
997         const SUnit *SU = SI.getSUnit();
998         if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
999           Succs.push_back(SU);
1000       }
1001     }
1002 
1003     SmallPtrSet<const SUnit*, 32> Visited;
1004     while (!Preds.empty()) {
1005       const SUnit *SU = Preds.pop_back_val();
1006       if (llvm::is_contained(Succs, SU))
1007         return false;
1008       Visited.insert(SU);
1009       for (const SDep &SI : SU->Preds)
1010         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
1011           Preds.push_back(SI.getSUnit());
1012     }
1013 
1014     return true;
1015   }
1016 
  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
1019   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
1020                          SmallPtrSetImpl<SUnit *> &Visited) const {
1021     SmallVector<SUnit *, 8> Worklist({To});
1022     unsigned Linked = 0;
1023 
1024     while (!Worklist.empty() && MaxChain-- > 0) {
1025       SUnit *SU = Worklist.pop_back_val();
1026       if (!Visited.insert(SU).second)
1027         continue;
1028 
      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
1031 
1032       if (SU->addPred(SDep(From, SDep::Artificial), false))
1033         ++Linked;
1034 
1035       for (SDep &SI : From->Succs) {
1036         SUnit *SUv = SI.getSUnit();
1037         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
1038           SUv->addPred(SDep(SU, SDep::Artificial), false);
1039       }
1040 
1041       for (SDep &SI : SU->Succs) {
1042         SUnit *Succ = SI.getSUnit();
1043         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
1044           Worklist.push_back(Succ);
1045       }
1046     }
1047 
1048     return Linked;
1049   }
1050 
1051   void apply(ScheduleDAGInstrs *DAGInstrs) override {
1052     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1053     if (!ST.hasMAIInsts() || DisablePowerSched)
1054       return;
1055     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1056     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1057     if (!TSchedModel || DAG->SUnits.empty())
1058       return;
1059 
    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that they can fill the MFMA shadow. It is
    // desirable to fill the MFMA shadow with SALU rather than VALU
    // instructions to prevent power consumption bursts and throttling.
1064     auto LastSALU = DAG->SUnits.begin();
1065     auto E = DAG->SUnits.end();
1066     SmallPtrSet<SUnit*, 32> Visited;
1067     for (SUnit &SU : DAG->SUnits) {
1068       MachineInstr &MAI = *SU.getInstr();
1069       if (!TII->isMAI(MAI) ||
1070            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
1071            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
1072         continue;
1073 
1074       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
1075 
1076       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
1077                  dbgs() << "Need " << Lat
1078                         << " instructions to cover latency.\n");
1079 
1080       // Find up to Lat independent scalar instructions as early as
1081       // possible such that they can be scheduled after this MFMA.
1082       for ( ; Lat && LastSALU != E; ++LastSALU) {
1083         if (Visited.count(&*LastSALU))
1084           continue;
1085 
1086         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
1087           continue;
1088 
1089         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1090       }
1091     }
1092   }
1093 };
1094 } // namespace
1095 
1096 void GCNSubtarget::getPostRAMutations(
1097     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1098   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1099 }
1100 
1101 std::unique_ptr<ScheduleDAGMutation>
1102 GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
1103   return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
1104 }
1105 
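// Return the common AMDGPUSubtarget view of MF's subtarget: a GCNSubtarget
// for amdgcn, an R600Subtarget otherwise.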
1106 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1107   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1108     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1109   else
1110     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1111 }
1112 
1113 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
1114   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1115     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1116   else
1117     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
1118 }
1119