//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

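// The generated SubtargetInfo for the AMDGPU target names its class after the
// target ("AMDGPUSubtarget"), a name this backend instead uses for the common
// base class, so rename it to GCNSubtarget while including the generated code.
// Each .inc file #undefs the GET_SUBTARGETINFO_* guards it consumes, which is
// why they are redefined before including the R600 variant.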
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be verified. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be verified. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly specified, turn
  // on FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets that do not support it, unless it was explicitly
  // requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

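// GFX10+ can read two scalar operands (SGPRs or a literal) on the constant
// bus per VALU instruction, but the 64-bit shift instructions below still
// accept only one scalar value input.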
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

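// Estimate the largest LDS allocation a single workgroup can make while still
// allowing NWaves waves per execution unit to be resident: the per-CU LDS is
// shared by all workgroups on the CU, so the budget grows with the wave limit
// and shrinks with the number of co-resident workgroups and requested waves.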
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

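// Inverse of getMaxLocalMemSizeWithWaveCount: estimate how many waves remain
// achievable when a workgroup allocates Bytes of LDS, clamped to the range
// [1, MaxWavesPerEU].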
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

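// Graphics shader calling conventions are assumed to be launched with at most
// a single wavefront per workgroup, so their default maximum is the wavefront
// size; compute-like conventions default to the subtarget's full limit.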
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1u, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

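// The requested sizes come from the "amdgpu-flat-work-group-size" function
// attribute as a "min,max" pair, e.g. "amdgpu-flat-work-group-size"="128,256".
// Requests that are inconsistent or outside the subtarget's limits fall back
// to the defaults.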
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

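// The requested range comes from the "amdgpu-waves-per-eu" function attribute,
// e.g. "amdgpu-waves-per-eu"="2,4" (the maximum may be omitted). A request is
// honored only if it is consistent, within the subtarget's limits, and
// achievable with the explicitly requested flat work group size, if any.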
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

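// Attach !range metadata to a workitem id or local size query so later passes
// can exploit the known work group size limits. The bound comes from the flat
// work group size, narrowed by reqd_work_group_size when present. Returns
// true if the metadata was added.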
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

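// Sum the ABI-aligned sizes of the kernel's explicit arguments, recording the
// largest alignment seen in MaxAlign.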
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

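// Total kernarg segment size: the explicit arguments placed at the target's
// explicit-argument offset, followed by any implicit arguments.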
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

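// Map an SGPR count to the number of waves that can be resident per execution
// unit. The thresholds differ per generation; on GFX10+ the SGPR count no
// longer limits occupancy.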
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

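// Map a VGPR count to occupancy: round the count up to the allocation
// granule, then see how many allocations of that size fit in the register
// file, clamped to [1, MaxWavesPerEU].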
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

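// Compute the occupancy implied by LDS usage and, when the counts are
// nonzero, by SGPR and VGPR usage.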
unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

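// Refine the latency of a data dependence when either endpoint is a bundle:
// scan the bundled instructions for the actual def of the register (or the
// first use, for a destination bundle) and adjust the latency accordingly.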
void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
                                         SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcI = Src->getInstr();
  MachineInstr *DstI = Dst->getInstr();

  if (SrcI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
    MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (DstI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

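  // Return true if an artificial edge making Pred a predecessor of Succ can
  // be added without creating a cycle, i.e. neither Pred nor any of its
  // transitive predecessors is already a transitive successor of Succ.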
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}