//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));
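// Note (illustrative usage): these cl::opt flags can be toggled on the llc
// command line, e.g. `llc -mtriple=amdgcn -amdgpu-use-aa-in-codegen=0 ...`,
// or passed through the clang driver via -mllvm.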

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions, flat
  // operations, or both; otherwise they cannot access a 64-bit global
  // address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    BackOffBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX940Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasImageInsts(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    EnableFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
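  // Scale the per-CU LDS budget by the fraction of waves this kernel occupies.
  // Illustrative numbers only: with 64 KiB of LDS, MaxWaves = 10, 4 work
  // groups per CU and NWaves = 5, the result is 64 KiB * 10 / 4 / 5 = 32 KiB.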
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
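  // Illustrative numbers only: with 64 KiB of LDS and Bytes = 16 KiB, up to
  // four work groups fit on a CU before LDS becomes the limiting factor.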
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);
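  // For example, a kernel carrying the IR attribute
  // "amdgpu-flat-work-group-size"="128,256" requests a minimum flat work group
  // size of 128 and a maximum of 256.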

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);
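  // For example, "amdgpu-waves-per-eu"="2,4" requests between 2 and 4 waves
  // per execution unit; the maximum may be omitted (e.g. "2"), since the last
  // argument above marks only the first value as required.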

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
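  // Kernels may carry !reqd_work_group_size metadata with three i32 operands,
  // e.g. !{i32 256, i32 1, i32 1}; Dim selects the x, y or z component.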
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
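  // For example, with a known work-group size of 256 in this dimension, a
  // workitem ID query gets !range [0, 256) while a local size query gets
  // !range [256, 257).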
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
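  // The front end may override this with the "amdgpu-implicitarg-num-bytes"
  // function attribute, e.g. "amdgpu-implicitarg-num-bytes"="48" (the value
  // here is illustrative).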
  unsigned NBytes = (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) ? 256 : 56;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

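  // Lay out each explicit argument at its byref or ABI alignment. For example,
  // a kernel taking (i32, double) ends up with ExplicitArgBytes = 16 and
  // MaxAlign = 8 under the usual ABI alignments.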
  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
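  // Round the VGPR count up to the allocation granule and divide it into the
  // total register file. Illustrative numbers only: with a granule of 4,
  // 256 VGPRs in total and MaxWaves = 10, 26 VGPRs round up to 28 and the
  // result is min(256 / 28, 10) = 9 waves.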
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if a maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);
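    // For example, "amdgpu-num-sgpr"="64" (an illustrative value) asks for a
    // cap of 64 SGPRs, subject to the validity checks below.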

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
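  // Altogether that is 16 user SGPRs plus 5 system SGPRs, i.e. at most 21
  // preloaded SGPRs.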
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if a maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;
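    // Note: on gfx90a, VGPRs and AGPRs are allocated from a combined register
    // budget, which is presumably why the requested count is doubled here.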

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
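    // Determine whether an artificial edge from Pred to Succ can be added
    // without creating a cycle, i.e. whether there is no existing path from
    // Succ back to Pred in the DAG.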
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}