1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUCallLowering.h"
17 #include "AMDGPUInstructionSelector.h"
18 #include "AMDGPULegalizerInfo.h"
19 #include "AMDGPURegisterBankInfo.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #define GET_SUBTARGETINFO_TARGET_DESC
42 #define GET_SUBTARGETINFO_CTOR
43 #undef AMDGPUSubtarget
44 #include "R600GenSubtargetInfo.inc"
45 
// Developer command-line knobs. These override or refine behavior that is not
// expressed as a subtarget feature.
static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

// Off by default; useVGPRIndexMode() consults this flag.
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

// Off by default; enableFlatScratch() consults this flag.
static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

// On by default; returned verbatim by GCNSubtarget::useAA().
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

// Defaulted out-of-line; the declaration in the header cannot default it
// (presumably due to members of incomplete type there -- TODO confirm).
GCNSubtarget::~GCNSubtarget() = default;
66 
/// Parse the feature string for an R600 subtarget and derive dependent
/// properties. Returns *this so it can be used in a constructor init list.
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  // Prepend +promote-alloca so it is on by default; the user feature string
  // is appended afterwards, so an explicit "-promote-alloca" still wins.
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Properties derived from the now-parsed generation/ISA bits.
  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}
79 
/// Parse the GCN feature string, resolve defaults for unspecified targets, and
/// fix up mutually-dependent features. Returns *this so the constructor can
/// use it in its init list (it runs before InstrInfo/TLInfo are built).
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Defaults that the user string (appended later) may still override.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  // If the user set any wavefrontsize feature, explicitly clear the sizes
  // they did NOT mention so only one remains set after parsing.
  if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
    if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  // User features go last so they take precedence over the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
     Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                        : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  // WavefrontSizeLog2 == 5 means wave64.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Derive the xnack/sramecc target-ID settings from the raw feature string.
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
176 
/// Base-class constructor: establishes conservative defaults for the feature
/// flags shared by the R600 and GCN subtargets. Derived constructors refine
/// these after feature-string parsing.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }
196 
/// GCN subtarget constructor. Every feature flag is initialized to false here
/// and then set by ParseSubtargetFeatures inside
/// initializeSubtargetDependencies(), which deliberately runs as the
/// initializer of InstrInfo (i.e. before TLInfo/FrameLowering are built).
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Performance characteristics.
    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    // Memory-access behavior.
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    // Optimization toggles.
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    // ISA capability flags, set per-generation by the parsed features.
    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    // Hardware hazard/bug workaround flags.
    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    // Side effect: parses FS and fills in all of the flags above.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  // GlobalISel support objects; InstSelector needs the concrete
  // AMDGPURegisterBankInfo, hence the static_cast from the generic pointer.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
330 
331 bool GCNSubtarget::enableFlatScratch() const {
332   return flatScratchIsArchitected() ||
333          (EnableFlatScratch && hasFlatScratchInsts());
334 }
335 
336 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
337   if (getGeneration() < GFX10)
338     return 1;
339 
340   switch (Opcode) {
341   case AMDGPU::V_LSHLREV_B64_e64:
342   case AMDGPU::V_LSHLREV_B64_gfx10:
343   case AMDGPU::V_LSHL_B64_e64:
344   case AMDGPU::V_LSHRREV_B64_e64:
345   case AMDGPU::V_LSHRREV_B64_gfx10:
346   case AMDGPU::V_LSHR_B64_e64:
347   case AMDGPU::V_ASHRREV_I64_e64:
348   case AMDGPU::V_ASHRREV_I64_gfx10:
349   case AMDGPU::V_ASHR_I64_e64:
350     return 1;
351   }
352 
353   return 2;
354 }
355 
/// This list was mostly derived from experimentation.
/// Returns true if \p Opcode writes zeros to the high 16 bits of its 32-bit
/// destination register on this subtarget (rather than preserving them).
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  // 16-bit ops that zero the high half up to and including GFX9.
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  // Multiply-add style ops: zero only on VI (gfx8).
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  // MIXLO/MIXHI explicitly write only one half, so they never zero the other.
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}
454 
455 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
456   const Function &F) const {
457   if (NWaves == 1)
458     return getLocalMemorySize();
459   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
460   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
461   if (!WorkGroupsPerCu)
462     return 0;
463   unsigned MaxWaves = getMaxWavesPerEU();
464   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
465 }
466 
// FIXME: Should return min,max range.
/// Compute the achievable wave occupancy given \p Bytes of LDS used by a
/// workgroup of function \p F. Returns at least 1 even for oversubscribed LDS.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  // Guard against Bytes == 0 to avoid dividing by zero.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}
503 
504 unsigned
505 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
506   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
507   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
508 }
509 
510 std::pair<unsigned, unsigned>
511 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
512   switch (CC) {
513   case CallingConv::AMDGPU_VS:
514   case CallingConv::AMDGPU_LS:
515   case CallingConv::AMDGPU_HS:
516   case CallingConv::AMDGPU_ES:
517   case CallingConv::AMDGPU_GS:
518   case CallingConv::AMDGPU_PS:
519     return std::make_pair(1, getWavefrontSize());
520   default:
521     return std::make_pair(1u, getMaxFlatWorkGroupSize());
522   }
523 }
524 
525 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
526   const Function &F) const {
527   // Default minimum/maximum flat work group sizes.
528   std::pair<unsigned, unsigned> Default =
529     getDefaultFlatWorkGroupSize(F.getCallingConv());
530 
531   // Requested minimum/maximum flat work group sizes.
532   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
533     F, "amdgpu-flat-work-group-size", Default);
534 
535   // Make sure requested minimum is less than requested maximum.
536   if (Requested.first > Requested.second)
537     return Default;
538 
539   // Make sure requested values do not violate subtarget's specifications.
540   if (Requested.first < getMinFlatWorkGroupSize())
541     return Default;
542   if (Requested.second > getMaxFlatWorkGroupSize())
543     return Default;
544 
545   return Requested;
546 }
547 
/// Resolve the (min, max) waves-per-EU range for \p F, honoring both the
/// "amdgpu-waves-per-eu" attribute and constraints implied by the flat work
/// group size. Invalid requests fall back to the computed default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  // (last argument = true: a missing second value is allowed and reads as 0.)
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
587 
588 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
589   auto Node = Kernel.getMetadata("reqd_work_group_size");
590   if (Node && Node->getNumOperands() == 3)
591     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
592   return std::numeric_limits<unsigned>::max();
593 }
594 
595 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
596   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
597 }
598 
599 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
600                                            unsigned Dimension) const {
601   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
602   if (ReqdSize != std::numeric_limits<unsigned>::max())
603     return ReqdSize - 1;
604   return getFlatWorkGroupSizes(Kernel).second - 1;
605 }
606 
/// Attach !range metadata to \p I bounding a local-workitem-id or
/// local-size query. Returns true if metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  // IdQuery distinguishes workitem-id intrinsics (range [0, size)) from
  // local-size intrinsics (range [Min, size]).
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      // Map the intrinsic to its dimension; id queries fall through to share
      // the dimension assignment with the corresponding size queries.
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      // Dim was set above (0-2) only for recognized intrinsics; UINT_MAX
      // otherwise, so this test filters out unrecognized calls.
      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  // No usable bound — don't attach metadata.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
668 
669 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
670   if (isMesaKernel(F))
671     return 16;
672   return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
673 }
674 
/// Total size in bytes of the explicit kernel arguments of \p F, laid out
/// with each argument aligned to its ABI (or byref) alignment. \p MaxAlign is
/// set to the largest alignment seen.
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    // byref arguments are passed in the kernarg segment as the pointee type.
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    // Pad up to this argument's alignment before placing it.
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}
698 
/// Size of the kernarg segment for \p F: explicit args (after the
/// target-specific explicit offset) plus any implicit argument area, rounded
/// up to 4 bytes. \p MaxAlign is set via getExplicitKernArgSize.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    // NOTE(review): this path realigns from ExplicitArgBytes alone, without
    // ExplicitOffset — matches long-standing behavior; confirm intent before
    // changing.
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
715 
716 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
717   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
718                                   : AMDGPUDwarfFlavour::Wave64;
719 }
720 
/// R600 subtarget constructor. Note the init-list ordering: TLInfo is
/// constructed with the result of initializeSubtargetDependencies() so the
/// feature string is parsed before target lowering is set up.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }
737 
/// Tune the generic machine scheduler's policy for GCN: always track register
/// pressure and schedule in both directions.
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}
754 
755 bool GCNSubtarget::hasMadF16() const {
756   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
757 }
758 
759 bool GCNSubtarget::useVGPRIndexMode() const {
760   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
761 }
762 
// Governed by the -amdgpu-use-aa-in-codegen flag (defaults to true).
bool GCNSubtarget::useAA() const { return UseAA; }
764 
765 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
766   if (getGeneration() >= AMDGPUSubtarget::GFX10)
767     return getMaxWavesPerEU();
768 
769   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
770     if (SGPRs <= 80)
771       return 10;
772     if (SGPRs <= 88)
773       return 9;
774     if (SGPRs <= 100)
775       return 8;
776     return 7;
777   }
778   if (SGPRs <= 48)
779     return 10;
780   if (SGPRs <= 56)
781     return 9;
782   if (SGPRs <= 64)
783     return 8;
784   if (SGPRs <= 72)
785     return 7;
786   if (SGPRs <= 80)
787     return 6;
788   return 5;
789 }
790 
791 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
792   unsigned MaxWaves = getMaxWavesPerEU();
793   unsigned Granule = getVGPRAllocGranule();
794   if (VGPRs < Granule)
795     return MaxWaves;
796   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
797   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
798 }
799 
/// Number of SGPRs reserved by the hardware/ABI (VCC, and where applicable
/// FLAT_SCRATCH and XNACK), given whether flat scratch init is needed.
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratchInit) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
816 
817 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
818   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
819   return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
820 }
821 
/// IR-level variant of getReservedNumSGPRs: infers whether the function will
/// need flat scratch init without a MachineFunction available.
unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect if the function has
  // flat scratch init is same as how MachineFunctionInfo derives.
  // NOTE(review): this must stay in sync with SIMachineFunctionInfo.
  bool FunctionHasFlatScratchInit = false;
  bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
  if (hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(F.getCallingConv()) &&
      (isAmdHsaOrMesa(F) || enableFlatScratch()) &&
      !flatScratchIsArchitected()) {
    if (HasCalls || HasStackObjects || enableFlatScratch())
      FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}
836 
837 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
838                                         unsigned NumSGPRs,
839                                         unsigned NumVGPRs) const {
840   unsigned Occupancy =
841     std::min(getMaxWavesPerEU(),
842              getOccupancyWithLocalMemSize(LDSSize, F));
843   if (NumSGPRs)
844     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
845   if (NumVGPRs)
846     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
847   return Occupancy;
848 }
849 
/// Compute the maximum number of SGPRs a function may allocate, taking into
/// account the waves-per-EU range, user/system input SGPRs, the reserved
/// count, and the "amdgpu-num-sgpr" attribute. An invalid attribute request
/// silently falls back to the wave-limit-derived maximum.
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 below means "ignore the attribute".)
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware bug workaround: clamp to a fixed count on affected subtargets.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}
896 
897 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
898   const Function &F = MF.getFunction();
899   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
900   return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
901                             getReservedNumSGPRs(MF));
902 }
903 
static unsigned getMaxNumPreloadedSGPRs() {
  // Upper bound on user SGPRs: private segment buffer (4), dispatch ptr (2),
  // queue ptr (2), kernarg segment ptr (2), dispatch ID (2), flat scratch
  // init (2) and implicit buffer ptr (2).
  constexpr unsigned MaxUserSGPRs = 4 + 2 + 2 + 2 + 2 + 2 + 2;

  // Upper bound on system SGPRs: workgroup IDs X/Y/Z (1 each), workgroup
  // info (1) and private segment wave byte offset (1).
  constexpr unsigned MaxSystemSGPRs = 1 + 1 + 1 + 1 + 1;

  return MaxUserSGPRs + MaxSystemSGPRs;
}
921 
922 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
923   return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
924                             getReservedNumSGPRs(F));
925 }
926 
927 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
928     const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
929   // Compute maximum number of VGPRs function can use using default/requested
930   // minimum number of waves per execution unit.
931   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
932 
933   // Check if maximum number of VGPRs was explicitly requested using
934   // "amdgpu-num-vgpr" attribute.
935   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
936     unsigned Requested = AMDGPU::getIntegerAttribute(
937       F, "amdgpu-num-vgpr", MaxNumVGPRs);
938 
939     if (hasGFX90AInsts())
940       Requested *= 2;
941 
942     // Make sure requested value is compatible with values implied by
943     // default/requested minimum/maximum number of waves per execution unit.
944     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
945       Requested = 0;
946     if (WavesPerEU.second &&
947         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
948       Requested = 0;
949 
950     if (Requested)
951       MaxNumVGPRs = Requested;
952   }
953 
954   return MaxNumVGPRs;
955 }
956 
957 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
958   return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
959 }
960 
961 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
962   const Function &F = MF.getFunction();
963   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
964   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
965 }
966 
// Adjust the latency of register data dependencies that involve an
// instruction bundle, since the scheduler models the dependency against the
// BUNDLE header rather than the bundled instruction that actually produces
// or consumes the register.
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  // Only register data dependencies between real instructions are adjusted.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // The definition is a bundle: take the latency of the last bundled
    // instruction that writes the register, decremented by one for each
    // bundled instruction that follows it (those issue slots already cover
    // part of the latency).
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // The use is a bundle: start from the full latency of the defining
    // instruction and subtract one per bundled instruction issued before
    // the first reader of the register, stopping early if the latency is
    // fully hidden.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}
1003 
namespace {
// DAG mutation that tries to fill the latency shadow of long-latency MFMA
// instructions with independent SALU instructions (rather than VALU) to
// avoid mAI power bursts; disabled by -amdgpu-disable-power-sched.
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  // True if \p SU is a non-terminator SALU instruction.
  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  // True if \p SU is a VALU instruction.
  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Check whether an artificial edge Pred -> Succ can be added without
  // creating a cycle: collect the transitive successors of \p Succ, then
  // walk the transitive predecessors of \p Pred looking for an overlap.
  // NodeNum order gives a cheap early-out for edges that go "forward".
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    // Breadth-first closure of Succ's successors; the vector doubles as the
    // visited set (membership checked via is_contained).
    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    // Walk Pred's predecessor closure; hitting any of Succ's successors
    // means the new edge would close a cycle.
    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as much SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      // Force the SALU instruction to be scheduled after the MFMA (From).
      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      // Push VALU successors of the MFMA after the linked SALU as well, so
      // the SALU actually lands in the shadow rather than after the VALUs.
      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      // Extend the chain through the SALU's own SALU successors.
      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      // Skip non-MAI instructions and the cheap accvgpr copies.
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      // Number of instructions needed to cover the MFMA's shadow.
      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace
1128 
1129 void GCNSubtarget::getPostRAMutations(
1130     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1131   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1132 }
1133 
1134 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1135   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1136     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1137   else
1138     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1139 }
1140 
1141 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
1142   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1143     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1144   else
1145     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
1146 }
1147