//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // Overridden by a disable in FS.

  // Disable mutually exclusive bits.
  if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
    if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). On HSA OSes this
  // defaults to the first amdgcn target that supports flat addressing; other
  // OSes default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
     Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                        : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +/-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +/-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

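// Flat scratch is used either when the target provides an architected flat
// scratch base (no explicit flat scratch init needed) or when it was requested
// via -amdgpu-enable-flat-scratch and the target has flat scratch
// instructions.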
bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

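// Before GFX10 a VALU instruction may read at most one SGPR or literal
// constant. GFX10 raises the limit to two, except for the 64-bit shifts,
// which remain limited to one.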
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

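// Return the maximum number of bytes of LDS a workgroup may use without
// restricting occupancy below NWaves waves per execution unit.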
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

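// Graphics shader calling conventions default to at most a single wavefront
// per workgroup; compute-like calling conventions may use the full flat
// workgroup size.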
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

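// Return the size of dimension Dim from the !reqd_work_group_size metadata on
// Kernel, or UINT_MAX if the metadata is absent or malformed.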
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

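// Attach !range metadata to a workitem-id or local-size intrinsic call based
// on the kernel's flat workgroup size bounds (narrowed by reqd_work_group_size
// when present). Returns true if metadata was added.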
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

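// Size of the kernarg segment: explicit kernel arguments plus any implicit
// arguments, rounded up to a multiple of 4 bytes so scalar loads may safely
// dereference past the end.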
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

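// Occupancy as limited by the VGPR budget: round the request up to the
// allocation granule and divide the total VGPR file size by it.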
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratchInit) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect if the function has flat scratch init is slightly
  // different from how the SIMachineFunctionInfo constructor derives it.
  // We don't use the amdgpu-calls or amdgpu-stack-objects attributes, or
  // isAmdHsaOrMesa, here as it doesn't really matter.
  // TODO: Outline this derivation logic and have just
  // one common function in the backend to avoid duplication.
  bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
  bool FunctionHasFlatScratchInit = false;
  if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
      enableFlatScratch()) {
    FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

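// Adjust latencies of data dependencies involving bundles: for a def inside a
// bundle, base the latency on the bundled instruction that actually writes
// the register; for a use inside a bundle, shrink the latency by the number
// of bundled instructions executed before the first reader of the register.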
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

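  // Return true if an artificial edge from Pred to Succ can be added without
  // creating a cycle, i.e. there is no existing path from Succ back to Pred.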
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size of
  // the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so they get a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU rather than VALU instructions
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}