1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Implements the AMDGPU-specific subclass of TargetSubtargetInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUCallLowering.h"
17 #include "AMDGPUInstructionSelector.h"
18 #include "AMDGPULegalizerInfo.h"
19 #include "AMDGPURegisterBankInfo.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #define GET_SUBTARGETINFO_TARGET_DESC
42 #define GET_SUBTARGETINFO_CTOR
43 #undef AMDGPUSubtarget
44 #include "R600GenSubtargetInfo.inc"
45 
46 static cl::opt<bool> DisablePowerSched(
47   "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
49   cl::init(false));
50 
51 static cl::opt<bool> EnableVGPRIndexMode(
52   "amdgpu-vgpr-index-mode",
53   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
54   cl::init(false));
55 
56 static cl::opt<bool> EnableFlatScratch(
57   "amdgpu-enable-flat-scratch",
58   cl::desc("Use flat scratch instructions"),
59   cl::init(false));
60 
61 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
62                            cl::desc("Enable the use of AA during codegen."),
63                            cl::init(true));
64 
65 GCNSubtarget::~GCNSubtarget() = default;
66 
67 R600Subtarget &
68 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
69                                                StringRef GPU, StringRef FS) {
70   SmallString<256> FullFS("+promote-alloca,");
71   FullFS += FS;
72   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
73 
74   HasMulU24 = getGeneration() >= EVERGREEN;
75   HasMulI24 = hasCaymanISA();
76 
77   return *this;
78 }
79 
80 GCNSubtarget &
81 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
82                                               StringRef GPU, StringRef FS) {
83   // Determine default and user-specified characteristics
84   //
85   // We want to be able to turn these off, but making this a subtarget feature
86   // for SI has the unhelpful behavior that it unsets everything else if you
87   // disable it.
88   //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
91 
92   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
93 
  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
95   if (isAmdHsaOS())
96     FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
97 
98   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
99 
100   // Disable mutually exclusive bits.
101   if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
102     if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
103       FullFS += "-wavefrontsize16,";
104     if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
105       FullFS += "-wavefrontsize32,";
106     if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
107       FullFS += "-wavefrontsize64,";
108   }
109 
110   FullFS += FS;
111 
112   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
113 
  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing; other OSes
  // default to the first amdgcn target.
118   if (Gen == AMDGPUSubtarget::INVALID) {
119      Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
120                                         : AMDGPUSubtarget::SOUTHERN_ISLANDS;
121   }
122 
  // We don't support FP64 for EG/NI at the moment.
124   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
125 
  // Targets must support either 64-bit offsets for MUBUF instructions or flat
  // operations; otherwise they cannot access a 64-bit global address space.
129   assert(hasAddr64() || hasFlat());
  // Unless flat-for-global is explicitly specified (with + or -), turn on
  // FlatForGlobal for targets that do not support ADDR64 variants of MUBUF
  // instructions. Such targets cannot use a 64-bit offset with a MUBUF
  // instruction to access the global address space.
134   if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
135     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
136     FlatForGlobal = true;
137   }
  // Unless flat-for-global is explicitly specified (with + or -), use MUBUF
  // instructions for global address space access if flat operations are not
  // available.
140   if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
141     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
142     FlatForGlobal = false;
143   }
144 
145   // Set defaults if needed.
146   if (MaxPrivateElementSize == 0)
147     MaxPrivateElementSize = 4;
148 
149   if (LDSBankCount == 0)
150     LDSBankCount = 32;
151 
152   if (TT.getArch() == Triple::amdgcn) {
153     if (LocalMemorySize == 0)
154       LocalMemorySize = 32768;
155 
156     // Do something sensible for unspecified target.
157     if (!HasMovrel && !HasVGPRIndexMode)
158       HasMovrel = true;
159   }
160 
161   // Don't crash on invalid devices.
162   if (WavefrontSizeLog2 == 0)
163     WavefrontSizeLog2 = 5;
164 
165   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
166 
167   TargetID.setTargetIDFromFeaturesString(FS);
168 
169   LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
170                     << TargetID.getXnackSetting() << '\n');
171   LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
172                     << TargetID.getSramEccSetting() << '\n');
173 
174   return *this;
175 }
176 
177 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
178   TargetTriple(TT),
179   GCN3Encoding(false),
180   Has16BitInsts(false),
181   HasMadMixInsts(false),
182   HasMadMacF32Insts(false),
183   HasDsSrc2Insts(false),
184   HasSDWA(false),
185   HasVOP3PInsts(false),
186   HasMulI24(true),
187   HasMulU24(true),
188   HasInv2PiInlineImm(false),
189   HasFminFmaxLegacy(true),
190   EnablePromoteAlloca(false),
191   HasTrigReducedRange(false),
192   MaxWavesPerEU(10),
193   LocalMemorySize(0),
194   WavefrontSizeLog2(0)
195   { }
196 
197 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
198                            const GCNTargetMachine &TM)
199     : // clang-format off
200     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
201     AMDGPUSubtarget(TT),
202     TargetTriple(TT),
203     TargetID(*this),
204     Gen(INVALID),
205     InstrItins(getInstrItineraryForCPU(GPU)),
206     LDSBankCount(0),
207     MaxPrivateElementSize(0),
208 
209     FastFMAF32(false),
210     FastDenormalF32(false),
211     HalfRate64Ops(false),
212     FullRate64Ops(false),
213 
214     FlatForGlobal(false),
215     AutoWaitcntBeforeBarrier(false),
216     UnalignedScratchAccess(false),
217     UnalignedAccessMode(false),
218 
219     HasApertureRegs(false),
220     SupportsXNACK(false),
221     EnableXNACK(false),
222     EnableTgSplit(false),
223     EnableCuMode(false),
224     TrapHandler(false),
225 
226     EnableLoadStoreOpt(false),
227     EnableUnsafeDSOffsetFolding(false),
228     EnableSIScheduler(false),
229     EnableDS128(false),
230     EnablePRTStrictNull(false),
231     DumpCode(false),
232 
233     FP64(false),
234     CIInsts(false),
235     GFX8Insts(false),
236     GFX9Insts(false),
237     GFX90AInsts(false),
238     GFX10Insts(false),
239     GFX10_3Insts(false),
240     GFX7GFX8GFX9Insts(false),
241     SGPRInitBug(false),
242     NegativeScratchOffsetBug(false),
243     NegativeUnalignedScratchOffsetBug(false),
244     HasSMemRealTime(false),
245     HasIntClamp(false),
246     HasFmaMixInsts(false),
247     HasMovrel(false),
248     HasVGPRIndexMode(false),
249     HasScalarStores(false),
250     HasScalarAtomics(false),
251     HasSDWAOmod(false),
252     HasSDWAScalar(false),
253     HasSDWASdst(false),
254     HasSDWAMac(false),
255     HasSDWAOutModsVOPC(false),
256     HasDPP(false),
257     HasDPP8(false),
258     Has64BitDPP(false),
259     HasPackedFP32Ops(false),
260     HasExtendedImageInsts(false),
261     HasR128A16(false),
262     HasGFX10A16(false),
263     HasG16(false),
264     HasNSAEncoding(false),
265     GFX10_AEncoding(false),
266     GFX10_BEncoding(false),
267     HasDLInsts(false),
268     HasDot1Insts(false),
269     HasDot2Insts(false),
270     HasDot3Insts(false),
271     HasDot4Insts(false),
272     HasDot5Insts(false),
273     HasDot6Insts(false),
274     HasDot7Insts(false),
275     HasMAIInsts(false),
276     HasPkFmacF16Inst(false),
277     HasAtomicFaddInsts(false),
278     SupportsSRAMECC(false),
279     EnableSRAMECC(false),
280     HasNoSdstCMPX(false),
281     HasVscnt(false),
282     HasGetWaveIdInst(false),
283     HasSMemTimeInst(false),
284     HasShaderCyclesRegister(false),
285     HasRegisterBanking(false),
286     HasVOP3Literal(false),
287     HasNoDataDepHazard(false),
288     FlatAddressSpace(false),
289     FlatInstOffsets(false),
290     FlatGlobalInsts(false),
291     FlatScratchInsts(false),
292     ScalarFlatScratchInsts(false),
293     HasArchitectedFlatScratch(false),
294     AddNoCarryInsts(false),
295     HasUnpackedD16VMem(false),
296     LDSMisalignedBug(false),
297     HasMFMAInlineLiteralBug(false),
298     UnalignedBufferAccess(false),
299     UnalignedDSAccess(false),
300     HasPackedTID(false),
301 
302     ScalarizeGlobal(false),
303 
304     HasVcmpxPermlaneHazard(false),
305     HasVMEMtoScalarWriteHazard(false),
306     HasSMEMtoVectorWriteHazard(false),
307     HasInstFwdPrefetchBug(false),
308     HasVcmpxExecWARHazard(false),
309     HasLdsBranchVmemWARHazard(false),
310     HasNSAtoVMEMBug(false),
311     HasNSAClauseBug(false),
312     HasOffset3fBug(false),
313     HasFlatSegmentOffsetBug(false),
314     HasImageStoreD16Bug(false),
315     HasImageGather4D16Bug(false),
316 
317     FeatureDisable(false),
318     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
319     TLInfo(TM, *this),
320     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
321   // clang-format on
322   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
323   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
324   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
325   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
326   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
329 }
330 
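// Flat scratch is used either when the hardware architects it (no separate
// flat scratch initialization is needed) or when it is explicitly requested
// on a subtarget that has the flat scratch instructions.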
331 bool GCNSubtarget::enableFlatScratch() const {
332   return flatScratchIsArchitected() ||
333          (EnableFlatScratch && hasFlatScratchInsts());
334 }
335 
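// GFX10+ generally allows two constant bus (SGPR or literal) operands per
// VALU instruction; earlier generations and the 64-bit shifts listed below
// are limited to one.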
336 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
337   if (getGeneration() < GFX10)
338     return 1;
339 
340   switch (Opcode) {
341   case AMDGPU::V_LSHLREV_B64_e64:
342   case AMDGPU::V_LSHLREV_B64_gfx10:
343   case AMDGPU::V_LSHL_B64_e64:
344   case AMDGPU::V_LSHRREV_B64_e64:
345   case AMDGPU::V_LSHRREV_B64_gfx10:
346   case AMDGPU::V_LSHR_B64_e64:
347   case AMDGPU::V_ASHRREV_I64_e64:
348   case AMDGPU::V_ASHRREV_I64_gfx10:
349   case AMDGPU::V_ASHR_I64_e64:
350     return 1;
351   }
352 
353   return 2;
354 }
355 
356 /// This list was mostly derived from experimentation.
357 bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
358   switch (Opcode) {
359   case AMDGPU::V_CVT_F16_F32_e32:
360   case AMDGPU::V_CVT_F16_F32_e64:
361   case AMDGPU::V_CVT_F16_U16_e32:
362   case AMDGPU::V_CVT_F16_U16_e64:
363   case AMDGPU::V_CVT_F16_I16_e32:
364   case AMDGPU::V_CVT_F16_I16_e64:
365   case AMDGPU::V_RCP_F16_e64:
366   case AMDGPU::V_RCP_F16_e32:
367   case AMDGPU::V_RSQ_F16_e64:
368   case AMDGPU::V_RSQ_F16_e32:
369   case AMDGPU::V_SQRT_F16_e64:
370   case AMDGPU::V_SQRT_F16_e32:
371   case AMDGPU::V_LOG_F16_e64:
372   case AMDGPU::V_LOG_F16_e32:
373   case AMDGPU::V_EXP_F16_e64:
374   case AMDGPU::V_EXP_F16_e32:
375   case AMDGPU::V_SIN_F16_e64:
376   case AMDGPU::V_SIN_F16_e32:
377   case AMDGPU::V_COS_F16_e64:
378   case AMDGPU::V_COS_F16_e32:
379   case AMDGPU::V_FLOOR_F16_e64:
380   case AMDGPU::V_FLOOR_F16_e32:
381   case AMDGPU::V_CEIL_F16_e64:
382   case AMDGPU::V_CEIL_F16_e32:
383   case AMDGPU::V_TRUNC_F16_e64:
384   case AMDGPU::V_TRUNC_F16_e32:
385   case AMDGPU::V_RNDNE_F16_e64:
386   case AMDGPU::V_RNDNE_F16_e32:
387   case AMDGPU::V_FRACT_F16_e64:
388   case AMDGPU::V_FRACT_F16_e32:
389   case AMDGPU::V_FREXP_MANT_F16_e64:
390   case AMDGPU::V_FREXP_MANT_F16_e32:
391   case AMDGPU::V_FREXP_EXP_I16_F16_e64:
392   case AMDGPU::V_FREXP_EXP_I16_F16_e32:
393   case AMDGPU::V_LDEXP_F16_e64:
394   case AMDGPU::V_LDEXP_F16_e32:
395   case AMDGPU::V_LSHLREV_B16_e64:
396   case AMDGPU::V_LSHLREV_B16_e32:
397   case AMDGPU::V_LSHRREV_B16_e64:
398   case AMDGPU::V_LSHRREV_B16_e32:
399   case AMDGPU::V_ASHRREV_I16_e64:
400   case AMDGPU::V_ASHRREV_I16_e32:
401   case AMDGPU::V_ADD_U16_e64:
402   case AMDGPU::V_ADD_U16_e32:
403   case AMDGPU::V_SUB_U16_e64:
404   case AMDGPU::V_SUB_U16_e32:
405   case AMDGPU::V_SUBREV_U16_e64:
406   case AMDGPU::V_SUBREV_U16_e32:
407   case AMDGPU::V_MUL_LO_U16_e64:
408   case AMDGPU::V_MUL_LO_U16_e32:
409   case AMDGPU::V_ADD_F16_e64:
410   case AMDGPU::V_ADD_F16_e32:
411   case AMDGPU::V_SUB_F16_e64:
412   case AMDGPU::V_SUB_F16_e32:
413   case AMDGPU::V_SUBREV_F16_e64:
414   case AMDGPU::V_SUBREV_F16_e32:
415   case AMDGPU::V_MUL_F16_e64:
416   case AMDGPU::V_MUL_F16_e32:
417   case AMDGPU::V_MAX_F16_e64:
418   case AMDGPU::V_MAX_F16_e32:
419   case AMDGPU::V_MIN_F16_e64:
420   case AMDGPU::V_MIN_F16_e32:
421   case AMDGPU::V_MAX_U16_e64:
422   case AMDGPU::V_MAX_U16_e32:
423   case AMDGPU::V_MIN_U16_e64:
424   case AMDGPU::V_MIN_U16_e32:
425   case AMDGPU::V_MAX_I16_e64:
426   case AMDGPU::V_MAX_I16_e32:
427   case AMDGPU::V_MIN_I16_e64:
428   case AMDGPU::V_MIN_I16_e32:
429     // On gfx10, all 16-bit instructions preserve the high bits.
430     return getGeneration() <= AMDGPUSubtarget::GFX9;
431   case AMDGPU::V_MAD_F16_e64:
432   case AMDGPU::V_MADAK_F16:
433   case AMDGPU::V_MADMK_F16:
434   case AMDGPU::V_MAC_F16_e64:
435   case AMDGPU::V_MAC_F16_e32:
436   case AMDGPU::V_FMAMK_F16:
437   case AMDGPU::V_FMAAK_F16:
438   case AMDGPU::V_MAD_U16_e64:
439   case AMDGPU::V_MAD_I16_e64:
440   case AMDGPU::V_FMA_F16_e64:
441   case AMDGPU::V_FMAC_F16_e64:
442   case AMDGPU::V_FMAC_F16_e32:
443   case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16 bits changed.
    // Most instructions maintain the legacy behavior of zeroing them; some
    // changed to preserving the high bits.
447     return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
448   case AMDGPU::V_MAD_MIXLO_F16:
449   case AMDGPU::V_MAD_MIXHI_F16:
450   default:
451     return false;
452   }
453 }
454 
455 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
456   const Function &F) const {
457   if (NWaves == 1)
458     return getLocalMemorySize();
459   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
460   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
461   if (!WorkGroupsPerCu)
462     return 0;
463   unsigned MaxWaves = getMaxWavesPerEU();
464   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
465 }
466 
467 // FIXME: Should return min,max range.
468 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
469   const Function &F) const {
470   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
471   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
472   if (!MaxWorkGroupsPerCu)
473     return 0;
474 
475   const unsigned WaveSize = getWavefrontSize();
476 
  // FIXME: Do we need to account for the LDS alignment requirement, rounding
  // the size up?
  // Compute the restriction based on LDS usage.
480   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
481 
482   // This can be queried with more LDS than is possible, so just assume the
483   // worst.
484   if (NumGroups == 0)
485     return 1;
486 
487   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
488 
489   // Round to the number of waves.
490   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
491   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
492 
493   // Clamp to the maximum possible number of waves.
494   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
495 
496   // FIXME: Needs to be a multiple of the group size?
497   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
498 
499   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
500          "computed invalid occupancy");
501   return MaxWaves;
502 }
503 
504 unsigned
505 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
506   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
507   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
508 }
509 
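// By default, graphics calling conventions get a flat work group size of at
// most one wavefront, while compute kernels may use up to the subtarget's
// maximum flat work group size.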
510 std::pair<unsigned, unsigned>
511 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
512   switch (CC) {
513   case CallingConv::AMDGPU_VS:
514   case CallingConv::AMDGPU_LS:
515   case CallingConv::AMDGPU_HS:
516   case CallingConv::AMDGPU_ES:
517   case CallingConv::AMDGPU_GS:
518   case CallingConv::AMDGPU_PS:
519     return std::make_pair(1, getWavefrontSize());
520   default:
521     return std::make_pair(1u, getMaxFlatWorkGroupSize());
522   }
523 }
524 
525 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
526   const Function &F) const {
527   // Default minimum/maximum flat work group sizes.
528   std::pair<unsigned, unsigned> Default =
529     getDefaultFlatWorkGroupSize(F.getCallingConv());
530 
531   // Requested minimum/maximum flat work group sizes.
532   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
533     F, "amdgpu-flat-work-group-size", Default);
534 
  // Make sure the requested minimum does not exceed the requested maximum.
536   if (Requested.first > Requested.second)
537     return Default;
538 
539   // Make sure requested values do not violate subtarget's specifications.
540   if (Requested.first < getMinFlatWorkGroupSize())
541     return Default;
542   if (Requested.second > getMaxFlatWorkGroupSize())
543     return Default;
544 
545   return Requested;
546 }
547 
548 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
549   const Function &F) const {
550   // Default minimum/maximum number of waves per execution unit.
551   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
552 
553   // Default/requested minimum/maximum flat work group sizes.
554   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
555 
556   // If minimum/maximum flat work group sizes were explicitly requested using
557   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
558   // number of waves per execution unit to values implied by requested
559   // minimum/maximum flat work group sizes.
560   unsigned MinImpliedByFlatWorkGroupSize =
561     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
562   Default.first = MinImpliedByFlatWorkGroupSize;
563   bool RequestedFlatWorkGroupSize =
564       F.hasFnAttribute("amdgpu-flat-work-group-size");
565 
566   // Requested minimum/maximum number of waves per execution unit.
567   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
568     F, "amdgpu-waves-per-eu", Default, true);
569 
  // Make sure the requested minimum does not exceed the requested maximum.
571   if (Requested.second && Requested.first > Requested.second)
572     return Default;
573 
574   // Make sure requested values do not violate subtarget's specifications.
575   if (Requested.first < getMinWavesPerEU() ||
576       Requested.second > getMaxWavesPerEU())
577     return Default;
578 
579   // Make sure requested values are compatible with values implied by requested
580   // minimum/maximum flat work group sizes.
581   if (RequestedFlatWorkGroupSize &&
582       Requested.first < MinImpliedByFlatWorkGroupSize)
583     return Default;
584 
585   return Requested;
586 }
587 
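// Return the work group size in dimension \p Dim from the function's
// reqd_work_group_size metadata, or UINT_MAX if the metadata is absent or
// malformed.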
588 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
589   auto Node = Kernel.getMetadata("reqd_work_group_size");
590   if (Node && Node->getNumOperands() == 3)
591     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
592   return std::numeric_limits<unsigned>::max();
593 }
594 
595 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
596   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
597 }
598 
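// The largest possible workitem ID in \p Dimension is one less than the work
// group size in that dimension (the required size if known, otherwise the
// maximum flat work group size).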
599 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
600                                            unsigned Dimension) const {
601   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
602   if (ReqdSize != std::numeric_limits<unsigned>::max())
603     return ReqdSize - 1;
604   return getFlatWorkGroupSizes(Kernel).second - 1;
605 }
606 
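// Attach !range metadata to a workitem ID or local size query, bounding it by
// the kernel's flat work group size (narrowed by reqd_work_group_size when
// present). Returns true if metadata was added.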
607 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
608   Function *Kernel = I->getParent()->getParent();
609   unsigned MinSize = 0;
610   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
611   bool IdQuery = false;
612 
  // If reqd_work_group_size is present, it narrows the value down.
614   if (auto *CI = dyn_cast<CallInst>(I)) {
615     const Function *F = CI->getCalledFunction();
616     if (F) {
617       unsigned Dim = UINT_MAX;
618       switch (F->getIntrinsicID()) {
619       case Intrinsic::amdgcn_workitem_id_x:
620       case Intrinsic::r600_read_tidig_x:
621         IdQuery = true;
622         LLVM_FALLTHROUGH;
623       case Intrinsic::r600_read_local_size_x:
624         Dim = 0;
625         break;
626       case Intrinsic::amdgcn_workitem_id_y:
627       case Intrinsic::r600_read_tidig_y:
628         IdQuery = true;
629         LLVM_FALLTHROUGH;
630       case Intrinsic::r600_read_local_size_y:
631         Dim = 1;
632         break;
633       case Intrinsic::amdgcn_workitem_id_z:
634       case Intrinsic::r600_read_tidig_z:
635         IdQuery = true;
636         LLVM_FALLTHROUGH;
637       case Intrinsic::r600_read_local_size_z:
638         Dim = 2;
639         break;
640       default:
641         break;
642       }
643 
644       if (Dim <= 3) {
645         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
646         if (ReqdSize != std::numeric_limits<unsigned>::max())
647           MinSize = MaxSize = ReqdSize;
648       }
649     }
650   }
651 
652   if (!MaxSize)
653     return false;
654 
  // Range metadata is [Lo, Hi). For an ID query the result is in
  // [0, MaxSize), so MaxSize works as Hi. For a size query the result can
  // equal MaxSize, so pass MaxSize + 1 as Hi.
657   if (IdQuery)
658     MinSize = 0;
659   else
660     ++MaxSize;
661 
662   MDBuilder MDB(I->getContext());
663   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
664                                                   APInt(32, MaxSize));
665   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
666   return true;
667 }
668 
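// Mesa kernels get a fixed 16 bytes of implicit arguments; otherwise the size
// comes from the amdgpu-implicitarg-num-bytes attribute (default 0).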
669 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
670   if (isMesaKernel(F))
671     return 16;
672   return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
673 }
674 
675 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
676                                                  Align &MaxAlign) const {
677   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
678          F.getCallingConv() == CallingConv::SPIR_KERNEL);
679 
680   const DataLayout &DL = F.getParent()->getDataLayout();
681   uint64_t ExplicitArgBytes = 0;
682   MaxAlign = Align(1);
683 
684   for (const Argument &Arg : F.args()) {
685     const bool IsByRef = Arg.hasByRefAttr();
686     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
687     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
688     if (!Alignment)
689       Alignment = DL.getABITypeAlign(ArgTy);
690 
691     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
692     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
693     MaxAlign = max(MaxAlign, Alignment);
694   }
695 
696   return ExplicitArgBytes;
697 }
698 
699 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
700                                                 Align &MaxAlign) const {
701   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
702 
703   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
704 
705   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
706   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
707   if (ImplicitBytes != 0) {
708     const Align Alignment = getAlignmentForImplicitArgPtr();
709     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
710   }
711 
712   // Being able to dereference past the end is useful for emitting scalar loads.
713   return alignTo(TotalSize, 4);
714 }
715 
716 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
717   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
718                                   : AMDGPUDwarfFlavour::Wave64;
719 }
720 
721 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
722                              const TargetMachine &TM) :
723   R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
724   AMDGPUSubtarget(TT),
725   InstrInfo(*this),
726   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
727   FMA(false),
728   CaymanISA(false),
729   CFALUBug(false),
730   HasVertexCache(false),
731   R600ALUInst(false),
732   FP64(false),
733   TexVTXClauseSize(0),
734   Gen(R600),
735   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
736   InstrItins(getInstrItineraryForCPU(GPU)) { }
737 
738 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
739                                       unsigned NumRegionInstrs) const {
740   // Track register pressure so the scheduler can try to decrease
741   // pressure once register usage is above the threshold defined by
742   // SIRegisterInfo::getRegPressureSetLimit()
743   Policy.ShouldTrackPressure = true;
744 
745   // Enabling both top down and bottom up scheduling seems to give us less
746   // register spills than just using one of these approaches on its own.
747   Policy.OnlyTopDown = false;
748   Policy.OnlyBottomUp = false;
749 
750   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
751   if (!enableSIScheduler())
752     Policy.ShouldTrackLaneMasks = true;
753 }
754 
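// V_MAD_F16 is only usable if the pseudo maps to a real MC opcode on this
// subtarget.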
755 bool GCNSubtarget::hasMadF16() const {
756   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
757 }
758 
759 bool GCNSubtarget::useVGPRIndexMode() const {
760   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
761 }
762 
763 bool GCNSubtarget::useAA() const { return UseAA; }
764 
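// SGPR usage limits occupancy on pre-GFX10 targets, with different thresholds
// before and after Volcanic Islands. On GFX10+ each wave has a fixed SGPR
// allocation, so SGPR count no longer limits the number of waves.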
765 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
766   if (getGeneration() >= AMDGPUSubtarget::GFX10)
767     return getMaxWavesPerEU();
768 
769   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
770     if (SGPRs <= 80)
771       return 10;
772     if (SGPRs <= 88)
773       return 9;
774     if (SGPRs <= 100)
775       return 8;
776     return 7;
777   }
778   if (SGPRs <= 48)
779     return 10;
780   if (SGPRs <= 56)
781     return 9;
782   if (SGPRs <= 64)
783     return 8;
784   if (SGPRs <= 72)
785     return 7;
786   if (SGPRs <= 80)
787     return 6;
788   return 5;
789 }
790 
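// Occupancy is limited by how many copies of the rounded-up VGPR count fit in
// the VGPR file. For example, on a target with 256 VGPRs and an allocation
// granule of 4, a function using 37 VGPRs rounds up to 40 and can achieve at
// most 256 / 40 = 6 waves.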
791 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
792   unsigned MaxWaves = getMaxWavesPerEU();
793   unsigned Granule = getVGPRAllocGranule();
794   if (VGPRs < Granule)
795     return MaxWaves;
796   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
797   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
798 }
799 
800 unsigned
801 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
802   if (getGeneration() >= AMDGPUSubtarget::GFX10)
803     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
804 
805   if (HasFlatScratchInit) {
806     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
807       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
808     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
809       return 4; // FLAT_SCRATCH, VCC (in that order).
810   }
811 
812   if (isXNACKEnabled())
813     return 4; // XNACK, VCC (in that order).
814   return 2; // VCC.
815 }
816 
817 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
818   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
819   return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
820 }
821 
822 unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect if the function has flat scratch init is slightly
  // different from how the SIMachineFunctionInfo constructor derives it. We
  // don't use the amdgpu-calls and amdgpu-stack-objects attributes or
  // isAmdHsaOrMesa here, as it doesn't really matter.
  // TODO: Factor this derivation logic into one common function in the
  // backend to avoid duplication.
830   bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
831   bool FunctionHasFlatScratchInit = false;
832   if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
833       enableFlatScratch()) {
834     FunctionHasFlatScratchInit = true;
835   }
836   return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
837 }
838 
839 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
840                                         unsigned NumSGPRs,
841                                         unsigned NumVGPRs) const {
842   unsigned Occupancy =
843     std::min(getMaxWavesPerEU(),
844              getOccupancyWithLocalMemSize(LDSSize, F));
845   if (NumSGPRs)
846     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
847   if (NumVGPRs)
848     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
849   return Occupancy;
850 }
851 
852 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
853     const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
854     unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
855   // Compute maximum number of SGPRs function can use using default/requested
856   // minimum number of waves per execution unit.
857   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
858   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
859 
860   // Check if maximum number of SGPRs was explicitly requested using
861   // "amdgpu-num-sgpr" attribute.
862   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
863     unsigned Requested = AMDGPU::getIntegerAttribute(
864       F, "amdgpu-num-sgpr", MaxNumSGPRs);
865 
866     // Make sure requested value does not violate subtarget's specifications.
867     if (Requested && (Requested <= ReservedNumSGPRs))
868       Requested = 0;
869 
870     // If more SGPRs are required to support the input user/system SGPRs,
871     // increase to accommodate them.
872     //
873     // FIXME: This really ends up using the requested number of SGPRs + number
874     // of reserved special registers in total. Theoretically you could re-use
875     // the last input registers for these special registers, but this would
876     // require a lot of complexity to deal with the weird aliasing.
877     unsigned InputNumSGPRs = PreloadedSGPRs;
878     if (Requested && Requested < InputNumSGPRs)
879       Requested = InputNumSGPRs;
880 
881     // Make sure requested value is compatible with values implied by
882     // default/requested minimum/maximum number of waves per execution unit.
883     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
884       Requested = 0;
885     if (WavesPerEU.second &&
886         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
887       Requested = 0;
888 
889     if (Requested)
890       MaxNumSGPRs = Requested;
891   }
892 
893   if (hasSGPRInitBug())
894     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
895 
896   return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
897 }
898 
899 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
900   const Function &F = MF.getFunction();
901   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
902   return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
903                             getReservedNumSGPRs(MF));
904 }
905 
906 static unsigned getMaxNumPreloadedSGPRs() {
907   // Max number of user SGPRs
908   unsigned MaxUserSGPRs = 4 + // private segment buffer
909                           2 + // Dispatch ptr
910                           2 + // queue ptr
911                           2 + // kernel segment ptr
912                           2 + // dispatch ID
913                           2 + // flat scratch init
914                           2;  // Implicit buffer ptr
915   // Max number of system SGPRs
916   unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
917                             1 + // WorkGroupIDY
918                             1 + // WorkGroupIDZ
919                             1 + // WorkGroupInfo
920                             1;  // private segment wave byte offset
921   return MaxUserSGPRs + MaxSystemSGPRs;
922 }
923 
924 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
925   return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
926                             getReservedNumSGPRs(F));
927 }
928 
929 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
930     const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
931   // Compute maximum number of VGPRs function can use using default/requested
932   // minimum number of waves per execution unit.
933   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
934 
935   // Check if maximum number of VGPRs was explicitly requested using
936   // "amdgpu-num-vgpr" attribute.
937   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
938     unsigned Requested = AMDGPU::getIntegerAttribute(
939       F, "amdgpu-num-vgpr", MaxNumVGPRs);
940 
941     if (hasGFX90AInsts())
942       Requested *= 2;
943 
944     // Make sure requested value is compatible with values implied by
945     // default/requested minimum/maximum number of waves per execution unit.
946     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
947       Requested = 0;
948     if (WavesPerEU.second &&
949         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
950       Requested = 0;
951 
952     if (Requested)
953       MaxNumVGPRs = Requested;
954   }
955 
956   return MaxNumVGPRs;
957 }
958 
959 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
960   return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
961 }
962 
963 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
964   const Function &F = MF.getFunction();
965   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
966   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
967 }
968 
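// Refine the latency of data dependencies that involve bundles: for a
// defining bundle, start from the latency of the bundled instruction that
// actually writes the register and subtract the instructions issued after it;
// for a using bundle, subtract the bundled instructions issued before the
// first reader of the register.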
969 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
970                                          int UseOpIdx, SDep &Dep) const {
971   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
972       !Def->isInstr() || !Use->isInstr())
973     return;
974 
975   MachineInstr *DefI = Def->getInstr();
976   MachineInstr *UseI = Use->getInstr();
977 
978   if (DefI->isBundle()) {
979     const SIRegisterInfo *TRI = getRegisterInfo();
980     auto Reg = Dep.getReg();
981     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
982     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
983     unsigned Lat = 0;
984     for (++I; I != E && I->isBundledWithPred(); ++I) {
985       if (I->modifiesRegister(Reg, TRI))
986         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
987       else if (Lat)
988         --Lat;
989     }
990     Dep.setLatency(Lat);
991   } else if (UseI->isBundle()) {
992     const SIRegisterInfo *TRI = getRegisterInfo();
993     auto Reg = Dep.getReg();
994     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
995     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
996     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
997     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
998       if (I->readsRegister(Reg, TRI))
999         break;
1000       --Lat;
1001     }
1002     Dep.setLatency(Lat);
1003   }
1004 }
1005 
1006 namespace {
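// DAG mutation that links otherwise independent SALU instructions behind
// long-latency MFMA instructions, so their shadow is filled with scalar work
// rather than VALU work (see apply() below).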
1007 struct FillMFMAShadowMutation : ScheduleDAGMutation {
1008   const SIInstrInfo *TII;
1009 
1010   ScheduleDAGMI *DAG;
1011 
1012   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
1013 
1014   bool isSALU(const SUnit *SU) const {
1015     const MachineInstr *MI = SU->getInstr();
1016     return MI && TII->isSALU(*MI) && !MI->isTerminator();
1017   }
1018 
1019   bool isVALU(const SUnit *SU) const {
1020     const MachineInstr *MI = SU->getInstr();
1021     return MI && TII->isVALU(*MI);
1022   }
1023 
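  // Return true if a dependency edge from \p Pred to \p Succ can be added
  // without creating a cycle, i.e. \p Succ does not already (transitively)
  // reach \p Pred.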
1024   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
1025     if (Pred->NodeNum < Succ->NodeNum)
1026       return true;
1027 
1028     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
1029 
1030     for (unsigned I = 0; I < Succs.size(); ++I) {
1031       for (const SDep &SI : Succs[I]->Succs) {
1032         const SUnit *SU = SI.getSUnit();
1033         if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
1034           Succs.push_back(SU);
1035       }
1036     }
1037 
1038     SmallPtrSet<const SUnit*, 32> Visited;
1039     while (!Preds.empty()) {
1040       const SUnit *SU = Preds.pop_back_val();
1041       if (llvm::is_contained(Succs, SU))
1042         return false;
1043       Visited.insert(SU);
1044       for (const SDep &SI : SU->Preds)
1045         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
1046           Preds.push_back(SI.getSUnit());
1047     }
1048 
1049     return true;
1050   }
1051 
  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
1054   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
1055                          SmallPtrSetImpl<SUnit *> &Visited) const {
1056     SmallVector<SUnit *, 8> Worklist({To});
1057     unsigned Linked = 0;
1058 
1059     while (!Worklist.empty() && MaxChain-- > 0) {
1060       SUnit *SU = Worklist.pop_back_val();
1061       if (!Visited.insert(SU).second)
1062         continue;
1063 
      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
1065                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
1066 
1067       if (SU->addPred(SDep(From, SDep::Artificial), false))
1068         ++Linked;
1069 
1070       for (SDep &SI : From->Succs) {
1071         SUnit *SUv = SI.getSUnit();
1072         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
1073           SUv->addPred(SDep(SU, SDep::Artificial), false);
1074       }
1075 
1076       for (SDep &SI : SU->Succs) {
1077         SUnit *Succ = SI.getSUnit();
1078         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
1079           Worklist.push_back(Succ);
1080       }
1081     }
1082 
1083     return Linked;
1084   }
1085 
1086   void apply(ScheduleDAGInstrs *DAGInstrs) override {
1087     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1088     if (!ST.hasMAIInsts() || DisablePowerSched)
1089       return;
1090     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1091     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1092     if (!TSchedModel || DAG->SUnits.empty())
1093       return;
1094 
    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU rather than VALU instructions
    // is desirable to prevent power consumption bursts and throttling.
1099     auto LastSALU = DAG->SUnits.begin();
1100     auto E = DAG->SUnits.end();
1101     SmallPtrSet<SUnit*, 32> Visited;
1102     for (SUnit &SU : DAG->SUnits) {
1103       MachineInstr &MAI = *SU.getInstr();
1104       if (!TII->isMAI(MAI) ||
1105            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
1106            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
1107         continue;
1108 
1109       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
1110 
1111       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
1112                  dbgs() << "Need " << Lat
1113                         << " instructions to cover latency.\n");
1114 
1115       // Find up to Lat independent scalar instructions as early as
1116       // possible such that they can be scheduled after this MFMA.
1117       for ( ; Lat && LastSALU != E; ++LastSALU) {
1118         if (Visited.count(&*LastSALU))
1119           continue;
1120 
1121         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
1122           continue;
1123 
1124         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1125       }
1126     }
1127   }
1128 };
1129 } // namespace
1130 
1131 void GCNSubtarget::getPostRAMutations(
1132     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1133   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1134 }
1135 
1136 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1137   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1138     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1139   else
1140     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1141 }
1142 
1143 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
1144   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1145     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1146   else
1147     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
1148 }
1149