1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
42 
// Command-line override to turn off the MFMA power-scheduling DAG mutation
// (see FillMFMAShadowMutation below).
static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

// Command-line override to prefer VGPR indexing mode over movrel when both
// are available (consumed by GCNSubtarget::useVGPRIndexMode()).
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));
52 
// Out-of-line destructor so the smart-pointer members set up in the
// constructor (CallLoweringInfo, Legalizer, RegBankInfo, InstSelector) are
// destroyed in this TU, where their pointee types are complete.
GCNSubtarget::~GCNSubtarget() = default;
54 
// Parse the feature string for an R600-family target and fix up the feature
// flags that depend on the selected generation. Returns *this so it can be
// used inside a member initializer list.
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  // Alloca-to-LDS promotion defaults on; user features appended afterwards
  // can still disable it.
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  // 24-bit multiply availability is generation-dependent.
  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}
74 
// Build the effective feature string for a GCN target (defaults first, then
// the user string so it can override them), parse it, and patch up flags the
// feature parser cannot express. Returns *this so it can be invoked from the
// constructor's member initializer list (see the GCNSubtarget constructor).
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FP64/FP16 denormals default on for SI and newer; anything older reached
  // through this path gets FP32 denormals forced off instead (see the
  // comment above for why this is not left to a plain subtarget feature).
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits: only one wavefrontsize feature may be
  // set, so if the user explicitly requested one, clear the others.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  // User features last so they win over the defaults assembled above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // XNACK was speculatively enabled in the default string above; drop it
  // again on targets that declare no support for it.
  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets with d16 loads, but don't support
  // ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}
168 
// Base-class constructor: establishes conservative defaults for the flags
// shared by the GCN and R600 subtargets. The derived constructors overwrite
// these via ParseSubtargetFeatures()/initializeSubtargetDependencies().
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }
187 
// GCNSubtarget constructor.
//
// The member initialization order below is load-bearing: the InstrInfo
// initializer calls initializeSubtargetDependencies(), which parses the
// feature string and finalizes all of the feature flags initialized above
// it, so TLInfo and FrameLowering are constructed against the finalized
// subtarget state.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    // HSA requires at least SEA_ISLANDS; otherwise assume the oldest GCN gen.
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    // Parses the feature string and fixes up all of the flags above.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  // GlobalISel support objects. The instruction selector consumes the
  // register bank info, so RegBankInfo must be created before InstSelector.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
296 
297 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
298   if (getGeneration() < GFX10)
299     return 1;
300 
301   switch (Opcode) {
302   case AMDGPU::V_LSHLREV_B64:
303   case AMDGPU::V_LSHLREV_B64_gfx10:
304   case AMDGPU::V_LSHL_B64:
305   case AMDGPU::V_LSHRREV_B64:
306   case AMDGPU::V_LSHRREV_B64_gfx10:
307   case AMDGPU::V_LSHR_B64:
308   case AMDGPU::V_ASHRREV_I64:
309   case AMDGPU::V_ASHRREV_I64_gfx10:
310   case AMDGPU::V_ASHR_I64:
311     return 1;
312   }
313 
314   return 2;
315 }
316 
317 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
318   const Function &F) const {
319   if (NWaves == 1)
320     return getLocalMemorySize();
321   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
322   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
323   if (!WorkGroupsPerCu)
324     return 0;
325   unsigned MaxWaves = getMaxWavesPerEU();
326   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
327 }
328 
329 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
330   const Function &F) const {
331   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
332   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
333   if (!WorkGroupsPerCu)
334     return 0;
335   unsigned MaxWaves = getMaxWavesPerEU();
336   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
337   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
338   NumWaves = std::min(NumWaves, MaxWaves);
339   NumWaves = std::max(NumWaves, 1u);
340   return NumWaves;
341 }
342 
343 unsigned
344 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
345   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
346   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
347 }
348 
349 std::pair<unsigned, unsigned>
350 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
351   switch (CC) {
352   case CallingConv::AMDGPU_VS:
353   case CallingConv::AMDGPU_LS:
354   case CallingConv::AMDGPU_HS:
355   case CallingConv::AMDGPU_ES:
356   case CallingConv::AMDGPU_GS:
357   case CallingConv::AMDGPU_PS:
358     return std::make_pair(1, getWavefrontSize());
359   default:
360     return std::make_pair(1u, getMaxFlatWorkGroupSize());
361   }
362 }
363 
364 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
365   const Function &F) const {
366   // Default minimum/maximum flat work group sizes.
367   std::pair<unsigned, unsigned> Default =
368     getDefaultFlatWorkGroupSize(F.getCallingConv());
369 
370   // Requested minimum/maximum flat work group sizes.
371   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
372     F, "amdgpu-flat-work-group-size", Default);
373 
374   // Make sure requested minimum is less than requested maximum.
375   if (Requested.first > Requested.second)
376     return Default;
377 
378   // Make sure requested values do not violate subtarget's specifications.
379   if (Requested.first < getMinFlatWorkGroupSize())
380     return Default;
381   if (Requested.second > getMaxFlatWorkGroupSize())
382     return Default;
383 
384   return Requested;
385 }
386 
// Determine the [min, max] number of waves per EU for \p F, honoring the
// "amdgpu-waves-per-eu" attribute and keeping the result consistent with an
// explicitly requested flat work group size. Invalid requests fall back to
// the default pair.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (A requested maximum of 0 means "unbounded" and skips this check.)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
431 
// Attach !range metadata to a local-id / local-size intrinsic call \p I,
// bounded by the kernel's flat work group size and narrowed by
// reqd_work_group_size metadata when present. Returns true if range
// metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  // IdQuery: the intrinsic returns a work-item id (range [0, size)) rather
  // than a size (range [min, max]).
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Map the intrinsic to the workgroup dimension it queries;
      // Dim stays UINT_MAX for unrelated calls.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // Matched intrinsics set Dim to 0-2; use the exact size for that
      // dimension when the kernel declares one.
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // No known upper bound: cannot emit a meaningful range.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
493 
494 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
495                                                  Align &MaxAlign) const {
496   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
497          F.getCallingConv() == CallingConv::SPIR_KERNEL);
498 
499   const DataLayout &DL = F.getParent()->getDataLayout();
500   uint64_t ExplicitArgBytes = 0;
501   MaxAlign = Align(1);
502 
503   for (const Argument &Arg : F.args()) {
504     Type *ArgTy = Arg.getType();
505 
506     const Align Alignment(DL.getABITypeAlignment(ArgTy));
507     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
508     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
509     MaxAlign = std::max(MaxAlign, Alignment);
510   }
511 
512   return ExplicitArgBytes;
513 }
514 
// Total kernarg segment size in bytes for \p F, covering the explicit
// argument offset, the explicit arguments, and any implicit arguments.
// MaxAlign is set via getExplicitKernArgSize().
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  // Target-specific offset of the first explicit argument in the segment.
  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    // NOTE(review): the recomputed total is based on ExplicitArgBytes alone
    // and drops ExplicitOffset -- confirm that is intended when implicit
    // arguments are present.
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
531 
// R600Subtarget constructor. The TLInfo initializer invokes
// initializeSubtargetDependencies(), so the feature flags are finalized
// before the lowering object is constructed.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }
548 
549 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
550                                       unsigned NumRegionInstrs) const {
551   // Track register pressure so the scheduler can try to decrease
552   // pressure once register usage is above the threshold defined by
553   // SIRegisterInfo::getRegPressureSetLimit()
554   Policy.ShouldTrackPressure = true;
555 
556   // Enabling both top down and bottom up scheduling seems to give us less
557   // register spills than just using one of these approaches on its own.
558   Policy.OnlyTopDown = false;
559   Policy.OnlyBottomUp = false;
560 
561   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
562   if (!enableSIScheduler())
563     Policy.ShouldTrackLaneMasks = true;
564 }
565 
566 bool GCNSubtarget::hasMadF16() const {
567   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
568 }
569 
570 bool GCNSubtarget::useVGPRIndexMode() const {
571   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
572 }
573 
574 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
575   if (getGeneration() >= AMDGPUSubtarget::GFX10)
576     return getMaxWavesPerEU();
577 
578   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
579     if (SGPRs <= 80)
580       return 10;
581     if (SGPRs <= 88)
582       return 9;
583     if (SGPRs <= 100)
584       return 8;
585     return 7;
586   }
587   if (SGPRs <= 48)
588     return 10;
589   if (SGPRs <= 56)
590     return 9;
591   if (SGPRs <= 64)
592     return 8;
593   if (SGPRs <= 72)
594     return 7;
595   if (SGPRs <= 80)
596     return 6;
597   return 5;
598 }
599 
600 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
601   unsigned MaxWaves = getMaxWavesPerEU();
602   unsigned Granule = getVGPRAllocGranule();
603   if (VGPRs < Granule)
604     return MaxWaves;
605   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
606   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
607 }
608 
609 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
610   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
611   if (getGeneration() >= AMDGPUSubtarget::GFX10)
612     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
613 
614   if (MFI.hasFlatScratchInit()) {
615     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
616       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
617     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
618       return 4; // FLAT_SCRATCH, VCC (in that order).
619   }
620 
621   if (isXNACKEnabled())
622     return 4; // XNACK, VCC (in that order).
623   return 2; // VCC.
624 }
625 
626 unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
627                                         unsigned LDSSize,
628                                         unsigned NumSGPRs,
629                                         unsigned NumVGPRs) const {
630   unsigned Occupancy =
631     std::min(getMaxWavesPerEU(),
632              getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
633   if (NumSGPRs)
634     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
635   if (NumVGPRs)
636     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
637   return Occupancy;
638 }
639 
// Maximum number of SGPRs the function may use, derived from the
// waves-per-EU requirement, optionally overridden by the "amdgpu-num-sgpr"
// attribute, and reduced by the reserved special registers and the SGPR
// init hardware bug.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  // A Requested value of 0 below means "invalid, ignore the attribute".
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // The SGPR-init hardware bug forces a fixed SGPR allocation size.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  // Leave room for the reserved registers, never exceeding the addressable
  // SGPR count.
  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
689 
// Maximum number of VGPRs the function may use, derived from the
// waves-per-EU requirement and optionally overridden by the
// "amdgpu-num-vgpr" attribute (mirrors getMaxNumSGPRs above).
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  // A Requested value of 0 below means "invalid, ignore the attribute".
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
719 
// Adjust the latency of a data dependency when either endpoint is a bundle,
// since the scheduler otherwise uses the latency of the bundle header
// rather than the instruction inside the bundle that actually produces or
// consumes the register.
void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
                                         SDep &Dep) const {
  // Only register data dependencies between real instructions are adjusted.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcI = Src->getInstr();
  MachineInstr *DstI = Dst->getInstr();

  if (SrcI->isBundle()) {
    // Source is a bundle: take the latency of the last bundled instruction
    // that defines Reg, minus one per bundled instruction issued after that
    // def (its latency is partially covered inside the bundle).
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
    MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (DstI->isBundle()) {
    // Destination is a bundle: start from the producer's latency and reduce
    // it by one per bundled instruction issued before the first reader of
    // Reg, since those instructions hide part of the latency.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}
756 
namespace {
// Post-RA DAG mutation that tries to schedule independent SALU instructions
// into the "shadow" of long-latency MFMA instructions (the cycles while an
// MFMA result is pending), so that shadow is filled with scalar work rather
// than VALU work. See apply() below for the motivation.
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  // SALU instruction that is safe to move (terminators are excluded).
  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Returns true if an artificial edge Pred -> Succ can be added without
  // creating a cycle, i.e. there is no existing path from Succ back to
  // Pred. Node numbers give a quick positive answer because edges normally
  // run from lower to higher numbered nodes.
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    // Collect all transitive successors of Succ...
    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    // ...then walk the transitive predecessors of Pred; reaching a node in
    // the successor set means the new edge would close a cycle.
    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Returns the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      // Make the SALU node a successor of the MFMA (From).
      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      // Push the MFMA's VALU successors after the SALU node as well, so the
      // scheduler does not fill the shadow with them instead.
      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      // Extend the chain through SALU successors of the current node.
      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      // Skip non-MAI instructions and the accvgpr read/write pseudos even
      // though the latter are classified as MAI.
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace
881 
882 void GCNSubtarget::getPostRAMutations(
883     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
884   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
885 }
886 
887 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
888   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
889     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
890   else
891     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
892 }
893 
894 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
895   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
896     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
897   else
898     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
899 }
900