//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
    if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

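// Flat scratch is used either when FLAT_SCRATCH is architecturally provided,
// or when the user opts in via -amdgpu-enable-flat-scratch on subtargets that
// have flat scratch instructions.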
bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

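// Number of SGPR/literal operands that may share the constant bus in a single
// VALU instruction: one before GFX10, and generally two on GFX10+, except for
// the 64-bit shifts listed below, which remain limited to one.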
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

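// Estimate the maximum number of LDS bytes a workgroup can use while still
// allowing NWaves waves per execution unit, based on F's maximum flat
// workgroup size.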
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

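// Returns the work group size in dimension Dim from the kernel's
// reqd_work_group_size metadata, or UINT_MAX if the metadata is absent or
// malformed.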
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

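// Attach !range metadata to a workitem id or local size query, bounded by the
// kernel's known flat workgroup size (narrowed by reqd_work_group_size when
// present). Returns true if metadata was added.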
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

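// Computes the total size in bytes of the explicit kernel arguments, honoring
// byref argument types and per-argument alignment, and reports the largest
// alignment seen in MaxAlign.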
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

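// V_MAD_F16 is available whenever the pseudo maps to a real MC opcode on this
// subtarget.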
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

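// Occupancy (waves per execution unit) as limited by SGPR usage. On GFX10+
// the SGPR count does not limit occupancy, so the subtarget maximum is
// returned.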
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

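// Occupancy as limited by VGPR usage: round the VGPR count up to the
// allocation granule, see how many such allocations fit in the register file,
// and clamp to the subtarget's wave limit.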
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratchInit) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic used here to detect whether the function needs the flat scratch
  // init is slightly different from how the SIMachineFunctionInfo constructor
  // derives it. We don't use the amdgpu-calls and amdgpu-stack-objects
  // attributes or isAmdHsaOrMesa here, as it doesn't really matter.
  // TODO: Outline this derivation logic and have just one common function in
  // the backend to avoid duplication.
  bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
  bool FunctionHasFlatScratchInit = false;
  if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
      enableFlatScratch()) {
    FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

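// For data dependencies involving bundles, refine the dependency latency to
// account for where inside the bundle the register is actually defined or
// first read, instead of using the latency of the bundle head.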
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

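  // Conservatively check that adding an artificial edge from Pred to Succ will
  // not create a cycle: reject the edge if Pred is already reachable from Succ
  // through existing successor edges.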
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the number
  // of instructions linked. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is preferable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}