1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUCallLowering.h"
17 #include "AMDGPUInstructionSelector.h"
18 #include "AMDGPULegalizerInfo.h"
19 #include "AMDGPURegisterBankInfo.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #define GET_SUBTARGETINFO_TARGET_DESC
42 #define GET_SUBTARGETINFO_CTOR
43 #undef AMDGPUSubtarget
44 #include "R600GenSubtargetInfo.inc"
45 
46 static cl::opt<bool> DisablePowerSched(
47   "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
49   cl::init(false));
50 
51 static cl::opt<bool> EnableVGPRIndexMode(
52   "amdgpu-vgpr-index-mode",
53   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
54   cl::init(false));
55 
56 static cl::opt<bool> EnableFlatScratch(
57   "amdgpu-enable-flat-scratch",
58   cl::desc("Use flat scratch instructions"),
59   cl::init(false));
60 
61 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
62                            cl::desc("Enable the use of AA during codegen."),
63                            cl::init(true));
64 
65 GCNSubtarget::~GCNSubtarget() = default;
66 
67 R600Subtarget &
68 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
69                                                StringRef GPU, StringRef FS) {
70   SmallString<256> FullFS("+promote-alloca,");
71   FullFS += FS;
72   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
73 
74   HasMulU24 = getGeneration() >= EVERGREEN;
75   HasMulI24 = hasCaymanISA();
76 
77   return *this;
78 }
79 
80 GCNSubtarget &
81 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
82                                               StringRef GPU, StringRef FS) {
83   // Determine default and user-specified characteristics
84   //
85   // We want to be able to turn these off, but making this a subtarget feature
86   // for SI has the unhelpful behavior that it unsets everything else if you
87   // disable it.
88   //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
91 
92   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
93 
  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
95   if (isAmdHsaOS())
96     FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
97 
98   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
99 
100   // Disable mutually exclusive bits.
101   if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
102     if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
103       FullFS += "-wavefrontsize16,";
104     if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
105       FullFS += "-wavefrontsize32,";
106     if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
107       FullFS += "-wavefrontsize64,";
108   }
109 
110   FullFS += FS;
111 
112   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
113 
  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). An HSA OS defaults to
  // the first amdgcn target that supports flat addressing; other OSes default
  // to the first amdgcn target.
118   if (Gen == AMDGPUSubtarget::INVALID) {
119      Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
120                                         : AMDGPUSubtarget::SOUTHERN_ISLANDS;
121   }
122 
123   // We don't support FP64 for EG/NI atm.
124   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
125 
  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
129   assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
134   if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
135     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
136     FlatForGlobal = true;
137   }
138   // Unless +-flat-for-global is specified, use MUBUF instructions for global
139   // address space access if flat operations are not available.
140   if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
141     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
142     FlatForGlobal = false;
143   }
144 
145   // Set defaults if needed.
146   if (MaxPrivateElementSize == 0)
147     MaxPrivateElementSize = 4;
148 
149   if (LDSBankCount == 0)
150     LDSBankCount = 32;
151 
152   if (TT.getArch() == Triple::amdgcn) {
153     if (LocalMemorySize == 0)
154       LocalMemorySize = 32768;
155 
    // Do something sensible for an unspecified target.
157     if (!HasMovrel && !HasVGPRIndexMode)
158       HasMovrel = true;
159   }
160 
161   // Don't crash on invalid devices.
162   if (WavefrontSizeLog2 == 0)
163     WavefrontSizeLog2 = 5;
164 
165   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
166   HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
167 
168   TargetID.setTargetIDFromFeaturesString(FS);
169 
170   LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
171                     << TargetID.getXnackSetting() << '\n');
172   LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
173                     << TargetID.getSramEccSetting() << '\n');
174 
175   return *this;
176 }
177 
178 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
179   TargetTriple(TT),
180   GCN3Encoding(false),
181   Has16BitInsts(false),
182   HasMadMixInsts(false),
183   HasMadMacF32Insts(false),
184   HasDsSrc2Insts(false),
185   HasSDWA(false),
186   HasVOP3PInsts(false),
187   HasMulI24(true),
188   HasMulU24(true),
189   HasSMulHi(false),
190   HasInv2PiInlineImm(false),
191   HasFminFmaxLegacy(true),
192   EnablePromoteAlloca(false),
193   HasTrigReducedRange(false),
194   MaxWavesPerEU(10),
195   LocalMemorySize(0),
196   WavefrontSizeLog2(0)
197   { }
198 
199 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
200                            const GCNTargetMachine &TM)
201     : // clang-format off
202     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
203     AMDGPUSubtarget(TT),
204     TargetTriple(TT),
205     TargetID(*this),
206     Gen(INVALID),
207     InstrItins(getInstrItineraryForCPU(GPU)),
208     LDSBankCount(0),
209     MaxPrivateElementSize(0),
210 
211     FastFMAF32(false),
212     FastDenormalF32(false),
213     HalfRate64Ops(false),
214     FullRate64Ops(false),
215 
216     FlatForGlobal(false),
217     AutoWaitcntBeforeBarrier(false),
218     UnalignedScratchAccess(false),
219     UnalignedAccessMode(false),
220 
221     HasApertureRegs(false),
222     SupportsXNACK(false),
223     EnableXNACK(false),
224     EnableTgSplit(false),
225     EnableCuMode(false),
226     TrapHandler(false),
227 
228     EnableLoadStoreOpt(false),
229     EnableUnsafeDSOffsetFolding(false),
230     EnableSIScheduler(false),
231     EnableDS128(false),
232     EnablePRTStrictNull(false),
233     DumpCode(false),
234 
235     FP64(false),
236     CIInsts(false),
237     GFX8Insts(false),
238     GFX9Insts(false),
239     GFX90AInsts(false),
240     GFX10Insts(false),
241     GFX10_3Insts(false),
242     GFX7GFX8GFX9Insts(false),
243     SGPRInitBug(false),
244     NegativeScratchOffsetBug(false),
245     NegativeUnalignedScratchOffsetBug(false),
246     HasSMemRealTime(false),
247     HasIntClamp(false),
248     HasFmaMixInsts(false),
249     HasMovrel(false),
250     HasVGPRIndexMode(false),
251     HasScalarStores(false),
252     HasScalarAtomics(false),
253     HasSDWAOmod(false),
254     HasSDWAScalar(false),
255     HasSDWASdst(false),
256     HasSDWAMac(false),
257     HasSDWAOutModsVOPC(false),
258     HasDPP(false),
259     HasDPP8(false),
260     Has64BitDPP(false),
261     HasPackedFP32Ops(false),
262     HasExtendedImageInsts(false),
263     HasR128A16(false),
264     HasGFX10A16(false),
265     HasG16(false),
266     HasNSAEncoding(false),
267     GFX10_AEncoding(false),
268     GFX10_BEncoding(false),
269     HasDLInsts(false),
270     HasDot1Insts(false),
271     HasDot2Insts(false),
272     HasDot3Insts(false),
273     HasDot4Insts(false),
274     HasDot5Insts(false),
275     HasDot6Insts(false),
276     HasDot7Insts(false),
277     HasMAIInsts(false),
278     HasPkFmacF16Inst(false),
279     HasAtomicFaddInsts(false),
280     SupportsSRAMECC(false),
281     EnableSRAMECC(false),
282     HasNoSdstCMPX(false),
283     HasVscnt(false),
284     HasGetWaveIdInst(false),
285     HasSMemTimeInst(false),
286     HasShaderCyclesRegister(false),
287     HasRegisterBanking(false),
288     HasVOP3Literal(false),
289     HasNoDataDepHazard(false),
290     FlatAddressSpace(false),
291     FlatInstOffsets(false),
292     FlatGlobalInsts(false),
293     FlatScratchInsts(false),
294     ScalarFlatScratchInsts(false),
295     HasArchitectedFlatScratch(false),
296     AddNoCarryInsts(false),
297     HasUnpackedD16VMem(false),
298     LDSMisalignedBug(false),
299     HasMFMAInlineLiteralBug(false),
300     UnalignedBufferAccess(false),
301     UnalignedDSAccess(false),
302     HasPackedTID(false),
303 
304     ScalarizeGlobal(false),
305 
306     HasVcmpxPermlaneHazard(false),
307     HasVMEMtoScalarWriteHazard(false),
308     HasSMEMtoVectorWriteHazard(false),
309     HasInstFwdPrefetchBug(false),
310     HasVcmpxExecWARHazard(false),
311     HasLdsBranchVmemWARHazard(false),
312     HasNSAtoVMEMBug(false),
313     HasNSAClauseBug(false),
314     HasOffset3fBug(false),
315     HasFlatSegmentOffsetBug(false),
316     HasImageStoreD16Bug(false),
317     HasImageGather4D16Bug(false),
318 
319     FeatureDisable(false),
320     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
321     TLInfo(TM, *this),
322     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
323   // clang-format on
324   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
325   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
326   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
327   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
328   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
329   InstSelector.reset(new AMDGPUInstructionSelector(
330   *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
331 }
332 
333 bool GCNSubtarget::enableFlatScratch() const {
334   return flatScratchIsArchitected() ||
335          (EnableFlatScratch && hasFlatScratchInsts());
336 }
337 
338 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
339   if (getGeneration() < GFX10)
340     return 1;
341 
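  // Assumed rationale for the special cases below: the 64-bit shifts can read
  // only a single SGPR or literal operand even on GFX10, so they keep the
  // pre-GFX10 limit of 1.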
342   switch (Opcode) {
343   case AMDGPU::V_LSHLREV_B64_e64:
344   case AMDGPU::V_LSHLREV_B64_gfx10:
345   case AMDGPU::V_LSHL_B64_e64:
346   case AMDGPU::V_LSHRREV_B64_e64:
347   case AMDGPU::V_LSHRREV_B64_gfx10:
348   case AMDGPU::V_LSHR_B64_e64:
349   case AMDGPU::V_ASHRREV_I64_e64:
350   case AMDGPU::V_ASHRREV_I64_gfx10:
351   case AMDGPU::V_ASHR_I64_e64:
352     return 1;
353   }
354 
355   return 2;
356 }
357 
358 /// This list was mostly derived from experimentation.
359 bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
360   switch (Opcode) {
361   case AMDGPU::V_CVT_F16_F32_e32:
362   case AMDGPU::V_CVT_F16_F32_e64:
363   case AMDGPU::V_CVT_F16_U16_e32:
364   case AMDGPU::V_CVT_F16_U16_e64:
365   case AMDGPU::V_CVT_F16_I16_e32:
366   case AMDGPU::V_CVT_F16_I16_e64:
367   case AMDGPU::V_RCP_F16_e64:
368   case AMDGPU::V_RCP_F16_e32:
369   case AMDGPU::V_RSQ_F16_e64:
370   case AMDGPU::V_RSQ_F16_e32:
371   case AMDGPU::V_SQRT_F16_e64:
372   case AMDGPU::V_SQRT_F16_e32:
373   case AMDGPU::V_LOG_F16_e64:
374   case AMDGPU::V_LOG_F16_e32:
375   case AMDGPU::V_EXP_F16_e64:
376   case AMDGPU::V_EXP_F16_e32:
377   case AMDGPU::V_SIN_F16_e64:
378   case AMDGPU::V_SIN_F16_e32:
379   case AMDGPU::V_COS_F16_e64:
380   case AMDGPU::V_COS_F16_e32:
381   case AMDGPU::V_FLOOR_F16_e64:
382   case AMDGPU::V_FLOOR_F16_e32:
383   case AMDGPU::V_CEIL_F16_e64:
384   case AMDGPU::V_CEIL_F16_e32:
385   case AMDGPU::V_TRUNC_F16_e64:
386   case AMDGPU::V_TRUNC_F16_e32:
387   case AMDGPU::V_RNDNE_F16_e64:
388   case AMDGPU::V_RNDNE_F16_e32:
389   case AMDGPU::V_FRACT_F16_e64:
390   case AMDGPU::V_FRACT_F16_e32:
391   case AMDGPU::V_FREXP_MANT_F16_e64:
392   case AMDGPU::V_FREXP_MANT_F16_e32:
393   case AMDGPU::V_FREXP_EXP_I16_F16_e64:
394   case AMDGPU::V_FREXP_EXP_I16_F16_e32:
395   case AMDGPU::V_LDEXP_F16_e64:
396   case AMDGPU::V_LDEXP_F16_e32:
397   case AMDGPU::V_LSHLREV_B16_e64:
398   case AMDGPU::V_LSHLREV_B16_e32:
399   case AMDGPU::V_LSHRREV_B16_e64:
400   case AMDGPU::V_LSHRREV_B16_e32:
401   case AMDGPU::V_ASHRREV_I16_e64:
402   case AMDGPU::V_ASHRREV_I16_e32:
403   case AMDGPU::V_ADD_U16_e64:
404   case AMDGPU::V_ADD_U16_e32:
405   case AMDGPU::V_SUB_U16_e64:
406   case AMDGPU::V_SUB_U16_e32:
407   case AMDGPU::V_SUBREV_U16_e64:
408   case AMDGPU::V_SUBREV_U16_e32:
409   case AMDGPU::V_MUL_LO_U16_e64:
410   case AMDGPU::V_MUL_LO_U16_e32:
411   case AMDGPU::V_ADD_F16_e64:
412   case AMDGPU::V_ADD_F16_e32:
413   case AMDGPU::V_SUB_F16_e64:
414   case AMDGPU::V_SUB_F16_e32:
415   case AMDGPU::V_SUBREV_F16_e64:
416   case AMDGPU::V_SUBREV_F16_e32:
417   case AMDGPU::V_MUL_F16_e64:
418   case AMDGPU::V_MUL_F16_e32:
419   case AMDGPU::V_MAX_F16_e64:
420   case AMDGPU::V_MAX_F16_e32:
421   case AMDGPU::V_MIN_F16_e64:
422   case AMDGPU::V_MIN_F16_e32:
423   case AMDGPU::V_MAX_U16_e64:
424   case AMDGPU::V_MAX_U16_e32:
425   case AMDGPU::V_MIN_U16_e64:
426   case AMDGPU::V_MIN_U16_e32:
427   case AMDGPU::V_MAX_I16_e64:
428   case AMDGPU::V_MAX_I16_e32:
429   case AMDGPU::V_MIN_I16_e64:
430   case AMDGPU::V_MIN_I16_e32:
431     // On gfx10, all 16-bit instructions preserve the high bits.
432     return getGeneration() <= AMDGPUSubtarget::GFX9;
433   case AMDGPU::V_MAD_F16_e64:
434   case AMDGPU::V_MADAK_F16:
435   case AMDGPU::V_MADMK_F16:
436   case AMDGPU::V_MAC_F16_e64:
437   case AMDGPU::V_MAC_F16_e32:
438   case AMDGPU::V_FMAMK_F16:
439   case AMDGPU::V_FMAAK_F16:
440   case AMDGPU::V_MAD_U16_e64:
441   case AMDGPU::V_MAD_I16_e64:
442   case AMDGPU::V_FMA_F16_e64:
443   case AMDGPU::V_FMAC_F16_e64:
444   case AMDGPU::V_FMAC_F16_e32:
445   case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them. Some
    // instructions changed to preserving the high bits.
449     return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
450   case AMDGPU::V_MAD_MIXLO_F16:
451   case AMDGPU::V_MAD_MIXHI_F16:
452   default:
453     return false;
454   }
455 }
456 
457 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
458   const Function &F) const {
459   if (NWaves == 1)
460     return getLocalMemorySize();
461   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
462   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
463   if (!WorkGroupsPerCu)
464     return 0;
465   unsigned MaxWaves = getMaxWavesPerEU();
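  // The LDS budget shrinks as more waves per EU are required; the division by
  // WorkGroupsPerCu spreads the CU's LDS across concurrent workgroups.
  // Illustrative numbers only: with 64 KiB of LDS, MaxWaves = 10,
  // WorkGroupsPerCu = 5 and NWaves = 10, this returns
  // 65536 * 10 / 5 / 10 = 13107 bytes.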
466   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
467 }
468 
469 // FIXME: Should return min,max range.
470 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
471   const Function &F) const {
472   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
473   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
474   if (!MaxWorkGroupsPerCu)
475     return 0;
476 
477   const unsigned WaveSize = getWavefrontSize();
478 
  // FIXME: Do we need to account for the alignment requirement of LDS by
  // rounding the size up?
  // Compute the restriction based on LDS usage.
482   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
483 
484   // This can be queried with more LDS than is possible, so just assume the
485   // worst.
486   if (NumGroups == 0)
487     return 1;
488 
489   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
490 
491   // Round to the number of waves.
492   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
493   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
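  // Illustrative numbers only: with 64 KiB of LDS and Bytes = 16 KiB,
  // NumGroups = 4 (if not clamped by MaxWorkGroupsPerCu); with a 256-item
  // workgroup and 64-wide waves, MaxGroupNumWaves = 4, so MaxWaves = 16 before
  // the clamp below.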
494 
495   // Clamp to the maximum possible number of waves.
496   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
497 
498   // FIXME: Needs to be a multiple of the group size?
499   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
500 
501   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
502          "computed invalid occupancy");
503   return MaxWaves;
504 }
505 
506 unsigned
507 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
508   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
509   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
510 }
511 
512 std::pair<unsigned, unsigned>
513 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
514   switch (CC) {
515   case CallingConv::AMDGPU_VS:
516   case CallingConv::AMDGPU_LS:
517   case CallingConv::AMDGPU_HS:
518   case CallingConv::AMDGPU_ES:
519   case CallingConv::AMDGPU_GS:
520   case CallingConv::AMDGPU_PS:
521     return std::make_pair(1, getWavefrontSize());
522   default:
523     return std::make_pair(1u, getMaxFlatWorkGroupSize());
524   }
525 }
526 
527 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
528   const Function &F) const {
529   // Default minimum/maximum flat work group sizes.
530   std::pair<unsigned, unsigned> Default =
531     getDefaultFlatWorkGroupSize(F.getCallingConv());
532 
533   // Requested minimum/maximum flat work group sizes.
534   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
535     F, "amdgpu-flat-work-group-size", Default);
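  // The attribute value is a "min,max" pair; e.g. a kernel annotated with
  // "amdgpu-flat-work-group-size"="128,256" requests workgroups of 128 to 256
  // work items.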
536 
  // Make sure the requested minimum does not exceed the requested maximum.
538   if (Requested.first > Requested.second)
539     return Default;
540 
541   // Make sure requested values do not violate subtarget's specifications.
542   if (Requested.first < getMinFlatWorkGroupSize())
543     return Default;
544   if (Requested.second > getMaxFlatWorkGroupSize())
545     return Default;
546 
547   return Requested;
548 }
549 
550 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
551   const Function &F) const {
552   // Default minimum/maximum number of waves per execution unit.
553   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
554 
555   // Default/requested minimum/maximum flat work group sizes.
556   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
557 
558   // If minimum/maximum flat work group sizes were explicitly requested using
559   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
560   // number of waves per execution unit to values implied by requested
561   // minimum/maximum flat work group sizes.
562   unsigned MinImpliedByFlatWorkGroupSize =
563     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
564   Default.first = MinImpliedByFlatWorkGroupSize;
565   bool RequestedFlatWorkGroupSize =
566       F.hasFnAttribute("amdgpu-flat-work-group-size");
567 
568   // Requested minimum/maximum number of waves per execution unit.
569   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
570     F, "amdgpu-waves-per-eu", Default, true);
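  // Also a "min[,max]" pair, e.g. "amdgpu-waves-per-eu"="2,4"; the trailing
  // 'true' lets the maximum be omitted in the attribute string.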
571 
  // Make sure the requested minimum does not exceed the requested maximum.
573   if (Requested.second && Requested.first > Requested.second)
574     return Default;
575 
576   // Make sure requested values do not violate subtarget's specifications.
577   if (Requested.first < getMinWavesPerEU() ||
578       Requested.second > getMaxWavesPerEU())
579     return Default;
580 
581   // Make sure requested values are compatible with values implied by requested
582   // minimum/maximum flat work group sizes.
583   if (RequestedFlatWorkGroupSize &&
584       Requested.first < MinImpliedByFlatWorkGroupSize)
585     return Default;
586 
587   return Requested;
588 }
589 
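// Read one dimension of the !reqd_work_group_size metadata, which (as emitted
// for OpenCL kernels) carries three i32 operands for X, Y and Z, e.g.
// !{i32 64, i32 1, i32 1}. Returns UINT_MAX if the metadata is absent or does
// not have exactly three operands.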
590 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
591   auto Node = Kernel.getMetadata("reqd_work_group_size");
592   if (Node && Node->getNumOperands() == 3)
593     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
594   return std::numeric_limits<unsigned>::max();
595 }
596 
597 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
598   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
599 }
600 
601 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
602                                            unsigned Dimension) const {
603   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
604   if (ReqdSize != std::numeric_limits<unsigned>::max())
605     return ReqdSize - 1;
606   return getFlatWorkGroupSizes(Kernel).second - 1;
607 }
608 
609 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
610   Function *Kernel = I->getParent()->getParent();
611   unsigned MinSize = 0;
612   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
613   bool IdQuery = false;
614 
  // If reqd_work_group_size is present, it narrows the value down.
616   if (auto *CI = dyn_cast<CallInst>(I)) {
617     const Function *F = CI->getCalledFunction();
618     if (F) {
619       unsigned Dim = UINT_MAX;
620       switch (F->getIntrinsicID()) {
621       case Intrinsic::amdgcn_workitem_id_x:
622       case Intrinsic::r600_read_tidig_x:
623         IdQuery = true;
624         LLVM_FALLTHROUGH;
625       case Intrinsic::r600_read_local_size_x:
626         Dim = 0;
627         break;
628       case Intrinsic::amdgcn_workitem_id_y:
629       case Intrinsic::r600_read_tidig_y:
630         IdQuery = true;
631         LLVM_FALLTHROUGH;
632       case Intrinsic::r600_read_local_size_y:
633         Dim = 1;
634         break;
635       case Intrinsic::amdgcn_workitem_id_z:
636       case Intrinsic::r600_read_tidig_z:
637         IdQuery = true;
638         LLVM_FALLTHROUGH;
639       case Intrinsic::r600_read_local_size_z:
640         Dim = 2;
641         break;
642       default:
643         break;
644       }
645 
646       if (Dim <= 3) {
647         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
648         if (ReqdSize != std::numeric_limits<unsigned>::max())
649           MinSize = MaxSize = ReqdSize;
650       }
651     }
652   }
653 
654   if (!MaxSize)
655     return false;
656 
  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi; for a size query we need to pass Hi + 1.
659   if (IdQuery)
660     MinSize = 0;
661   else
662     ++MaxSize;
663 
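  // For example, with a required size of 256 in this dimension, an ID query
  // gets the range [0, 256) (IDs 0..255) and a size query gets [256, 257)
  // (the size is exactly 256).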
664   MDBuilder MDB(I->getContext());
665   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
666                                                   APInt(32, MaxSize));
667   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
668   return true;
669 }
670 
671 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
672   if (isMesaKernel(F))
673     return 16;
674   return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
675 }
676 
677 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
678                                                  Align &MaxAlign) const {
679   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
680          F.getCallingConv() == CallingConv::SPIR_KERNEL);
681 
682   const DataLayout &DL = F.getParent()->getDataLayout();
683   uint64_t ExplicitArgBytes = 0;
684   MaxAlign = Align(1);
685 
686   for (const Argument &Arg : F.args()) {
687     const bool IsByRef = Arg.hasByRefAttr();
688     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
689     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
690     if (!Alignment)
691       Alignment = DL.getABITypeAlign(ArgTy);
692 
693     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
694     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
695     MaxAlign = max(MaxAlign, Alignment);
696   }
697 
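  // Illustrative example: for kernel arguments (i32, double), the i32 occupies
  // bytes 0-3, the double is aligned up to offset 8 and occupies bytes 8-15,
  // so ExplicitArgBytes is 16 and MaxAlign is 8.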
698   return ExplicitArgBytes;
699 }
700 
701 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
702                                                 Align &MaxAlign) const {
703   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
704 
705   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
706 
707   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
708   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
709   if (ImplicitBytes != 0) {
710     const Align Alignment = getAlignmentForImplicitArgPtr();
711     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
712   }
713 
714   // Being able to dereference past the end is useful for emitting scalar loads.
715   return alignTo(TotalSize, 4);
716 }
717 
718 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
719   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
720                                   : AMDGPUDwarfFlavour::Wave64;
721 }
722 
723 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
724                              const TargetMachine &TM) :
725   R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
726   AMDGPUSubtarget(TT),
727   InstrInfo(*this),
728   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
729   FMA(false),
730   CaymanISA(false),
731   CFALUBug(false),
732   HasVertexCache(false),
733   R600ALUInst(false),
734   FP64(false),
735   TexVTXClauseSize(0),
736   Gen(R600),
737   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
738   InstrItins(getInstrItineraryForCPU(GPU)) { }
739 
740 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
741                                       unsigned NumRegionInstrs) const {
742   // Track register pressure so the scheduler can try to decrease
743   // pressure once register usage is above the threshold defined by
744   // SIRegisterInfo::getRegPressureSetLimit()
745   Policy.ShouldTrackPressure = true;
746 
  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
749   Policy.OnlyTopDown = false;
750   Policy.OnlyBottomUp = false;
751 
752   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
753   if (!enableSIScheduler())
754     Policy.ShouldTrackLaneMasks = true;
755 }
756 
757 bool GCNSubtarget::hasMadF16() const {
758   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
759 }
760 
761 bool GCNSubtarget::useVGPRIndexMode() const {
762   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
763 }
764 
765 bool GCNSubtarget::useAA() const { return UseAA; }
766 
767 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
768   if (getGeneration() >= AMDGPUSubtarget::GFX10)
769     return getMaxWavesPerEU();
770 
771   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
772     if (SGPRs <= 80)
773       return 10;
774     if (SGPRs <= 88)
775       return 9;
776     if (SGPRs <= 100)
777       return 8;
778     return 7;
779   }
780   if (SGPRs <= 48)
781     return 10;
782   if (SGPRs <= 56)
783     return 9;
784   if (SGPRs <= 64)
785     return 8;
786   if (SGPRs <= 72)
787     return 7;
788   if (SGPRs <= 80)
789     return 6;
790   return 5;
791 }
792 
793 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
794   unsigned MaxWaves = getMaxWavesPerEU();
795   unsigned Granule = getVGPRAllocGranule();
796   if (VGPRs < Granule)
797     return MaxWaves;
798   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
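  // Illustrative numbers only: with 256 total VGPRs and a granule of 4,
  // VGPRs = 70 rounds up to 72, giving min(max(256 / 72, 1), MaxWaves) = 3.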
799   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
800 }
801 
802 unsigned
803 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
804   if (getGeneration() >= AMDGPUSubtarget::GFX10)
805     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
806 
807   if (HasFlatScratchInit) {
808     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
809       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
810     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
811       return 4; // FLAT_SCRATCH, VCC (in that order).
812   }
813 
814   if (isXNACKEnabled())
815     return 4; // XNACK, VCC (in that order).
816   return 2; // VCC.
817 }
818 
819 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
820   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
821   return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
822 }
823 
824 unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect whether the function has flat scratch init is slightly
  // different from how the SIMachineFunctionInfo constructor derives it.
  // We don't use the amdgpu-calls and amdgpu-stack-objects attributes or
  // isAmdHsaOrMesa here, as it doesn't really matter.
  // TODO: Factor this derivation logic into one common function in the backend
  // to avoid duplication.
832   bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
833   bool FunctionHasFlatScratchInit = false;
834   if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
835       enableFlatScratch()) {
836     FunctionHasFlatScratchInit = true;
837   }
838   return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
839 }
840 
841 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
842                                         unsigned NumSGPRs,
843                                         unsigned NumVGPRs) const {
844   unsigned Occupancy =
845     std::min(getMaxWavesPerEU(),
846              getOccupancyWithLocalMemSize(LDSSize, F));
847   if (NumSGPRs)
848     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
849   if (NumVGPRs)
850     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
851   return Occupancy;
852 }
853 
854 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
855     const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
856     unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
859   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
860   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
861 
862   // Check if maximum number of SGPRs was explicitly requested using
863   // "amdgpu-num-sgpr" attribute.
864   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
865     unsigned Requested = AMDGPU::getIntegerAttribute(
866       F, "amdgpu-num-sgpr", MaxNumSGPRs);
867 
868     // Make sure requested value does not violate subtarget's specifications.
869     if (Requested && (Requested <= ReservedNumSGPRs))
870       Requested = 0;
871 
872     // If more SGPRs are required to support the input user/system SGPRs,
873     // increase to accommodate them.
874     //
875     // FIXME: This really ends up using the requested number of SGPRs + number
876     // of reserved special registers in total. Theoretically you could re-use
877     // the last input registers for these special registers, but this would
878     // require a lot of complexity to deal with the weird aliasing.
879     unsigned InputNumSGPRs = PreloadedSGPRs;
880     if (Requested && Requested < InputNumSGPRs)
881       Requested = InputNumSGPRs;
882 
883     // Make sure requested value is compatible with values implied by
884     // default/requested minimum/maximum number of waves per execution unit.
885     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
886       Requested = 0;
887     if (WavesPerEU.second &&
888         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
889       Requested = 0;
890 
891     if (Requested)
892       MaxNumSGPRs = Requested;
893   }
894 
895   if (hasSGPRInitBug())
896     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
897 
898   return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
899 }
900 
901 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
902   const Function &F = MF.getFunction();
903   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
904   return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
905                             getReservedNumSGPRs(MF));
906 }
907 
908 static unsigned getMaxNumPreloadedSGPRs() {
909   // Max number of user SGPRs
910   unsigned MaxUserSGPRs = 4 + // private segment buffer
911                           2 + // Dispatch ptr
912                           2 + // queue ptr
913                           2 + // kernel segment ptr
914                           2 + // dispatch ID
915                           2 + // flat scratch init
916                           2;  // Implicit buffer ptr
917   // Max number of system SGPRs
918   unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
919                             1 + // WorkGroupIDY
920                             1 + // WorkGroupIDZ
921                             1 + // WorkGroupInfo
922                             1;  // private segment wave byte offset
923   return MaxUserSGPRs + MaxSystemSGPRs;
924 }
925 
926 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
927   return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
928                             getReservedNumSGPRs(F));
929 }
930 
931 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
932     const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
935   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
936 
937   // Check if maximum number of VGPRs was explicitly requested using
938   // "amdgpu-num-vgpr" attribute.
939   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
940     unsigned Requested = AMDGPU::getIntegerAttribute(
941       F, "amdgpu-num-vgpr", MaxNumVGPRs);
942 
943     if (hasGFX90AInsts())
944       Requested *= 2;
945 
946     // Make sure requested value is compatible with values implied by
947     // default/requested minimum/maximum number of waves per execution unit.
948     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
949       Requested = 0;
950     if (WavesPerEU.second &&
951         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
952       Requested = 0;
953 
954     if (Requested)
955       MaxNumVGPRs = Requested;
956   }
957 
958   return MaxNumVGPRs;
959 }
960 
961 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
962   return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
963 }
964 
965 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
966   const Function &F = MF.getFunction();
967   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
968   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
969 }
970 
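// Adjust the latency of a data dependence when either endpoint is a bundle:
// walk the bundled instructions to find the one that actually defines (or
// reads) the register and adjust the edge latency for its position within the
// bundle.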
971 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
972                                          int UseOpIdx, SDep &Dep) const {
973   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
974       !Def->isInstr() || !Use->isInstr())
975     return;
976 
977   MachineInstr *DefI = Def->getInstr();
978   MachineInstr *UseI = Use->getInstr();
979 
980   if (DefI->isBundle()) {
981     const SIRegisterInfo *TRI = getRegisterInfo();
982     auto Reg = Dep.getReg();
983     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
984     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
985     unsigned Lat = 0;
986     for (++I; I != E && I->isBundledWithPred(); ++I) {
987       if (I->modifiesRegister(Reg, TRI))
988         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
989       else if (Lat)
990         --Lat;
991     }
992     Dep.setLatency(Lat);
993   } else if (UseI->isBundle()) {
994     const SIRegisterInfo *TRI = getRegisterInfo();
995     auto Reg = Dep.getReg();
996     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
997     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
998     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
999     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
1000       if (I->readsRegister(Reg, TRI))
1001         break;
1002       --Lat;
1003     }
1004     Dep.setLatency(Lat);
1005   }
1006 }
1007 
1008 namespace {
1009 struct FillMFMAShadowMutation : ScheduleDAGMutation {
1010   const SIInstrInfo *TII;
1011 
1012   ScheduleDAGMI *DAG;
1013 
1014   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
1015 
1016   bool isSALU(const SUnit *SU) const {
1017     const MachineInstr *MI = SU->getInstr();
1018     return MI && TII->isSALU(*MI) && !MI->isTerminator();
1019   }
1020 
1021   bool isVALU(const SUnit *SU) const {
1022     const MachineInstr *MI = SU->getInstr();
1023     return MI && TII->isVALU(*MI);
1024   }
1025 
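  // Check whether an artificial edge Pred -> Succ can be added without
  // creating a cycle, i.e. return false if Succ already reaches Pred through
  // existing edges.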
1026   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
1027     if (Pred->NodeNum < Succ->NodeNum)
1028       return true;
1029 
1030     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
1031 
1032     for (unsigned I = 0; I < Succs.size(); ++I) {
1033       for (const SDep &SI : Succs[I]->Succs) {
1034         const SUnit *SU = SI.getSUnit();
1035         if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
1036           Succs.push_back(SU);
1037       }
1038     }
1039 
1040     SmallPtrSet<const SUnit*, 32> Visited;
1041     while (!Preds.empty()) {
1042       const SUnit *SU = Preds.pop_back_val();
1043       if (llvm::is_contained(Succs, SU))
1044         return false;
1045       Visited.insert(SU);
1046       for (const SDep &SI : SU->Preds)
1047         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
1048           Preds.push_back(SI.getSUnit());
1049     }
1050 
1051     return true;
1052   }
1053 
  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
1056   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
1057                          SmallPtrSetImpl<SUnit *> &Visited) const {
1058     SmallVector<SUnit *, 8> Worklist({To});
1059     unsigned Linked = 0;
1060 
1061     while (!Worklist.empty() && MaxChain-- > 0) {
1062       SUnit *SU = Worklist.pop_back_val();
1063       if (!Visited.insert(SU).second)
1064         continue;
1065 
      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
1068 
1069       if (SU->addPred(SDep(From, SDep::Artificial), false))
1070         ++Linked;
1071 
1072       for (SDep &SI : From->Succs) {
1073         SUnit *SUv = SI.getSUnit();
1074         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
1075           SUv->addPred(SDep(SU, SDep::Artificial), false);
1076       }
1077 
1078       for (SDep &SI : SU->Succs) {
1079         SUnit *Succ = SI.getSUnit();
1080         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
1081           Worklist.push_back(Succ);
1082       }
1083     }
1084 
1085     return Linked;
1086   }
1087 
1088   void apply(ScheduleDAGInstrs *DAGInstrs) override {
1089     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1090     if (!ST.hasMAIInsts() || DisablePowerSched)
1091       return;
1092     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1093     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1094     if (!TSchedModel || DAG->SUnits.empty())
1095       return;
1096 
    // Scan for long-latency MFMA instructions and try to make available SALU
    // instructions depend on them so that the SALU instructions can fill the
    // MFMA shadow. Filling the shadow with SALU rather than VALU instructions
    // is desirable to prevent power consumption bursts and throttling.
1101     auto LastSALU = DAG->SUnits.begin();
1102     auto E = DAG->SUnits.end();
1103     SmallPtrSet<SUnit*, 32> Visited;
1104     for (SUnit &SU : DAG->SUnits) {
1105       MachineInstr &MAI = *SU.getInstr();
1106       if (!TII->isMAI(MAI) ||
1107            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
1108            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
1109         continue;
1110 
1111       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
1112 
1113       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
1114                  dbgs() << "Need " << Lat
1115                         << " instructions to cover latency.\n");
1116 
1117       // Find up to Lat independent scalar instructions as early as
1118       // possible such that they can be scheduled after this MFMA.
1119       for ( ; Lat && LastSALU != E; ++LastSALU) {
1120         if (Visited.count(&*LastSALU))
1121           continue;
1122 
1123         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
1124           continue;
1125 
1126         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1127       }
1128     }
1129   }
1130 };
1131 } // namespace
1132 
1133 void GCNSubtarget::getPostRAMutations(
1134     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1135   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1136 }
1137 
1138 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1139   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1140     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1141   else
1142     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1143 }
1144 
1145 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
1146   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1147     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1148   else
1149     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
1150 }
1151