//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +/-flat-for-global is explicitly specified, turn on FlatForGlobal
  // for all OSes on VI and newer hardware to avoid assertion failures due to
  // missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

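// Returns how many of an instruction's operands may read the scalar constant
// bus (e.g. SGPRs and literal constants). Before GFX10 the limit is always
// one; GFX10 raises it to two for most VALU instructions, except for the
// 64-bit shifts listed below, which remain limited to a single read.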
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

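// Returns an estimate of how much LDS a single workgroup may use while still
// allowing NWaves waves to run concurrently, roughly
//   LocalMemorySize * MaxWavesPerEU / WorkGroupsPerCU / NWaves,
// where WorkGroupsPerCU is derived from the function's maximum flat workgroup
// size. With a single wave the whole LDS is available.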
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

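// Returns the number of waves that can run given an LDS usage of Bytes.
// Illustrative example (hypothetical numbers, not tied to a specific target):
// with 64 KiB of LDS, 10 waves per EU and 4 workgroups per CU, the budget is
// 64 KiB * 10 / 4 = 160 KiB, so a kernel using 32 KiB of LDS gets
// min(160 / 32, 10) = 5 waves. The result is clamped to [1, MaxWaves].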
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

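// Returns the {min, max} flat workgroup size for F. Frontends can request a
// range with the "amdgpu-flat-work-group-size" function attribute, whose
// value is a comma-separated pair, e.g. "128,256". Requests that are
// inconsistent or outside the subtarget's limits fall back to the default.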
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

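// Returns the {min, max} number of waves per execution unit for F. The
// "amdgpu-waves-per-eu" attribute takes a pair such as "2,4" (the maximum may
// be omitted) and is additionally checked against the values implied by an
// explicitly requested flat workgroup size.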
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

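// Attaches !range metadata to a workitem ID or local size query so later
// passes can assume the result is bounded by the workgroup size. For example,
// a call to llvm.amdgcn.workitem.id.x in a kernel whose maximum flat
// workgroup size is 256 gets the range !{i32 0, i32 256} (half-open).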
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

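// Computes the number of bytes of explicit kernel arguments, laying each
// argument out at its ABI type alignment, and reports the largest alignment
// seen via MaxAlign. Illustrative example (layout is hypothetical): an i32
// followed by a 16-byte-aligned <4 x float> occupies 4 + 12 (padding) + 16 =
// 32 bytes with MaxAlign 16.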
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align::None();

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

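// Returns the occupancy limit implied by a VGPR count: usage is rounded up to
// the allocation granule and divided into the total VGPR budget. Illustrative
// example (assuming a granule of 4 and 256 total VGPRs): 25 VGPRs round up to
// 28, giving min(256 / 28, MaxWaves) = 9 waves.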
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

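// Combines the individual limits above: the achievable occupancy is the
// minimum of the wave limits implied by LDS usage, SGPR count and VGPR count
// (a register count of zero means "no constraint from that resource").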
unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
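// Post-RA DAG mutation that links pairs of neighbouring memory operations of
// the same kind (VMEM, FLAT, SMRD or DS) with artificial edges so the
// scheduler keeps them together instead of interleaving other work.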
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In the DAG pre-process, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

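  // Returns true when an artificial edge making Pred a predecessor of Succ
  // can be added without creating a cycle, i.e. Pred is not already reachable
  // from Succ in the scheduling DAG.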
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}