//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
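  // For example, a feature string containing "+wavefrontsize32" causes the
  // other wavefront sizes to be explicitly disabled below, so only the
  // requested size remains set.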
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
     Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                        : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

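// Flat scratch is used either when the hardware architects it (no explicit
// init required) or when it was explicitly requested via
// -amdgpu-enable-flat-scratch and the subtarget has the instructions for it.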
bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

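// Number of operands an instruction may read from the constant bus (SGPRs and
// literals). Before GFX10 the limit is 1; GFX10 raises it to 2 except for the
// 64-bit shifts listed below.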
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them. Some
    // instructions changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
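  // Example with hypothetical values: LocalMemorySize = 32768, MaxWaves = 10,
  // WorkGroupsPerCu = 8 and NWaves = 2 gives 32768 * 10 / 8 / 2 = 20480 bytes
  // of LDS available at that wave count.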
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
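// Example with hypothetical values, ignoring the MaxWorkGroupsPerCu clamp:
// 32 KiB of LDS, 4096 bytes used per workgroup, a 256-wide workgroup and wave
// size 64 give NumGroups = 32768 / 4096 = 8 and MaxGroupNumWaves =
// (256 + 63) / 64 = 4, so the result is min(8 * 4, getMaxWavesPerEU()).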
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the LDS alignment requirement by rounding
  // the size up?
  // Compute the occupancy restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

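// The "amdgpu-flat-work-group-size" function attribute holds a "min,max" pair;
// e.g. "amdgpu-flat-work-group-size"="128,256" (hypothetical values) requests
// workgroups of 128 to 256 work items. Requests outside the subtarget's limits
// fall back to the default below.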
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

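// The "amdgpu-waves-per-eu" attribute likewise holds a "min,max" pair, e.g.
// "amdgpu-waves-per-eu"="2,4" (hypothetical values); the trailing boolean
// argument to getIntegerPairAttribute below allows the maximum to be omitted.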
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

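// Kernels may carry OpenCL-style !reqd_work_group_size metadata with three
// i32 operands, e.g. !{i32 256, i32 1, i32 1} (hypothetical sizes); anything
// else is treated as "unknown".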
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we pass the max size as Hi;
  // for a size query we pass max size + 1.
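  // For example, with a hypothetical required size of 256 in the queried
  // dimension, a workitem id query gets the range [0, 256) while a local size
  // query gets [256, 257).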
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

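// Example with a hypothetical signature and a typical data layout: a kernel
// taking (i32, <4 x float>) places the i32 at offset 0 (4 bytes) and the
// 16-byte-aligned vector at offset 16, giving ExplicitArgBytes = 32 and
// MaxAlign = 16.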
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

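// Example: on a VOLCANIC_ISLANDS-or-newer (pre-GFX10) target, 96 SGPRs fall
// into the "<= 100" bucket below and yield an occupancy of 8 waves per EU.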
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

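// Example with hypothetical numbers: with an allocation granule of 4, 256
// total VGPRs and a request for 36 VGPRs, RoundedRegs is 36 and the result is
// min(max(256 / 36, 1), MaxWaves) = min(7, MaxWaves).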
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratchInit || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect whether the function has flat scratch init differs
  // slightly from how the SIMachineFunctionInfo constructor derives it.
  // We don't use the amdgpu-calls and amdgpu-stack-objects attributes or
  // isAmdHsaOrMesa here, as they don't really matter.
  // TODO: Outline this derivation logic and have just one common function in
  // the backend to avoid duplication.
  bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
  bool FunctionHasFlatScratchInit = false;
  if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
      enableFlatScratch()) {
    FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
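  // 4 + 2 + 2 + 2 + 2 + 2 + 2 = 16 user SGPRs plus 5 system SGPRs, i.e. 21 in
  // total.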
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

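  // When the def is inside a bundle, the latency seen by the use is the
  // latency of the defining instruction reduced by the number of bundled
  // instructions issued after it; the symmetric adjustment is applied when
  // the use is bundled.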
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

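  // Conservatively check whether Pred can be made a predecessor of Succ
  // without creating a cycle, i.e. whether the DAG already contains a path
  // from Succ back to Pred.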
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible; link up to MaxChain
  // instructions and return the number of newly linked instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that they can fill the MFMA shadow. It is
    // preferable to fill the shadow with SALU rather than VALU instructions to
    // avoid power consumption bursts and the resulting throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}