1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
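// Note (explanatory, based on how the TableGen-generated headers are usually
// structured): the generated .inc consumes the GET_SUBTARGETINFO_* macros, so
// they are defined again before including the R600 variant below; the
// temporary AMDGPUSubtarget -> GCNSubtarget define renames the generated
// definitions for the GCN include only.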
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
42 
43 static cl::opt<bool> DisablePowerSched(
44   "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
46   cl::init(false));
47 
48 static cl::opt<bool> EnableVGPRIndexMode(
49   "amdgpu-vgpr-index-mode",
50   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51   cl::init(false));
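// For illustration only: as cl::opt flags, these can be passed to llc
// directly or through clang via -mllvm, e.g.
//   llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-disable-power-sched in.ll
//   clang ... -mllvm -amdgpu-vgpr-index-mode ...
// (example invocations; the gfx908 target here is just an assumption).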
52 
53 GCNSubtarget::~GCNSubtarget() = default;
54 
55 R600Subtarget &
56 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
57                                                StringRef GPU, StringRef FS) {
58   SmallString<256> FullFS("+promote-alloca,");
59   FullFS += FS;
60   ParseSubtargetFeatures(GPU, FullFS);
61 
62   HasMulU24 = getGeneration() >= EVERGREEN;
63   HasMulI24 = hasCaymanISA();
64 
65   return *this;
66 }
67 
68 GCNSubtarget &
69 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
70                                               StringRef GPU, StringRef FS) {
71   // Determine default and user-specified characteristics
72   //
73   // We want to be able to turn these off, but making this a subtarget feature
74   // for SI has the unhelpful behavior that it unsets everything else if you
75   // disable it.
76   //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
79 
80   // Assuming ECC is enabled is the conservative default.
81   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
82 
83   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
84     FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
85 
86   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
87 
88   // Disable mutually exclusive bits.
89   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
90     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
91       FullFS += "-wavefrontsize16,";
92     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
93       FullFS += "-wavefrontsize32,";
94     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
95       FullFS += "-wavefrontsize64,";
96   }
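  // For example, if FS requests "+wavefrontsize32", the other wavefrontsize
  // features are force-disabled here, so appending the user string below
  // cannot leave two wavefront sizes enabled at once.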
97 
98   FullFS += FS;
99 
100   ParseSubtargetFeatures(GPU, FullFS);
101 
  // We don't support FP64 for EG/NI at the moment.
103   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
104 
  // Unless +flat-for-global or -flat-for-global is explicitly given, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
108   if (!hasAddr64() && !FS.contains("flat-for-global")) {
109     FlatForGlobal = true;
110   }
111 
112   // Set defaults if needed.
113   if (MaxPrivateElementSize == 0)
114     MaxPrivateElementSize = 4;
115 
116   if (LDSBankCount == 0)
117     LDSBankCount = 32;
118 
119   if (TT.getArch() == Triple::amdgcn) {
120     if (LocalMemorySize == 0)
121       LocalMemorySize = 32768;
122 
123     // Do something sensible for unspecified target.
124     if (!HasMovrel && !HasVGPRIndexMode)
125       HasMovrel = true;
126   }
127 
128   // Don't crash on invalid devices.
129   if (WavefrontSize == 0)
130     WavefrontSize = 64;
131 
132   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
133 
  // XNACK is enabled by default above; disable it on targets that do not
  // support it unless it was explicitly requested.
136   if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
137     ToggleFeature(AMDGPU::FeatureXNACK);
138     EnableXNACK = false;
139   }
140 
  // ECC is on by default, but turn it off if the hardware doesn't support it.
  // This matters for the gfx9 targets that have d16 loads but do not support
  // ECC.
144   if (DoesNotSupportSRAMECC && EnableSRAMECC) {
145     ToggleFeature(AMDGPU::FeatureSRAMECC);
146     EnableSRAMECC = false;
147   }
148 
149   return *this;
150 }
151 
152 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
153   TargetTriple(TT),
154   Has16BitInsts(false),
155   HasMadMixInsts(false),
156   FPExceptions(false),
157   HasSDWA(false),
158   HasVOP3PInsts(false),
159   HasMulI24(true),
160   HasMulU24(true),
161   HasInv2PiInlineImm(false),
162   HasFminFmaxLegacy(true),
163   EnablePromoteAlloca(false),
164   HasTrigReducedRange(false),
165   MaxWavesPerEU(10),
166   LocalMemorySize(0),
167   WavefrontSize(0)
168   { }
169 
170 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
171                            const GCNTargetMachine &TM) :
172     AMDGPUGenSubtargetInfo(TT, GPU, FS),
173     AMDGPUSubtarget(TT),
174     TargetTriple(TT),
175     Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
176     InstrItins(getInstrItineraryForCPU(GPU)),
177     LDSBankCount(0),
178     MaxPrivateElementSize(0),
179 
180     FastFMAF32(false),
181     FastDenormalF32(false),
182     HalfRate64Ops(false),
183 
184     FlatForGlobal(false),
185     AutoWaitcntBeforeBarrier(false),
186     CodeObjectV3(false),
187     UnalignedScratchAccess(false),
188     UnalignedBufferAccess(false),
189 
190     HasApertureRegs(false),
191     EnableXNACK(false),
192     DoesNotSupportXNACK(false),
193     EnableCuMode(false),
194     TrapHandler(false),
195 
196     EnableLoadStoreOpt(false),
197     EnableUnsafeDSOffsetFolding(false),
198     EnableSIScheduler(false),
199     EnableDS128(false),
200     EnablePRTStrictNull(false),
201     DumpCode(false),
202 
203     FP64(false),
204     GCN3Encoding(false),
205     CIInsts(false),
206     GFX8Insts(false),
207     GFX9Insts(false),
208     GFX10Insts(false),
209     GFX7GFX8GFX9Insts(false),
210     SGPRInitBug(false),
211     HasSMemRealTime(false),
212     HasIntClamp(false),
213     HasFmaMixInsts(false),
214     HasMovrel(false),
215     HasVGPRIndexMode(false),
216     HasScalarStores(false),
217     HasScalarAtomics(false),
218     HasSDWAOmod(false),
219     HasSDWAScalar(false),
220     HasSDWASdst(false),
221     HasSDWAMac(false),
222     HasSDWAOutModsVOPC(false),
223     HasDPP(false),
224     HasDPP8(false),
225     HasR128A16(false),
226     HasGFX10A16(false),
227     HasNSAEncoding(false),
228     HasDLInsts(false),
229     HasDot1Insts(false),
230     HasDot2Insts(false),
231     HasDot3Insts(false),
232     HasDot4Insts(false),
233     HasDot5Insts(false),
234     HasDot6Insts(false),
235     HasMAIInsts(false),
236     HasPkFmacF16Inst(false),
237     HasAtomicFaddInsts(false),
238     EnableSRAMECC(false),
239     DoesNotSupportSRAMECC(false),
240     HasNoSdstCMPX(false),
241     HasVscnt(false),
242     HasRegisterBanking(false),
243     HasVOP3Literal(false),
244     HasNoDataDepHazard(false),
245     FlatAddressSpace(false),
246     FlatInstOffsets(false),
247     FlatGlobalInsts(false),
248     FlatScratchInsts(false),
249     ScalarFlatScratchInsts(false),
250     AddNoCarryInsts(false),
251     HasUnpackedD16VMem(false),
252     LDSMisalignedBug(false),
253     HasMFMAInlineLiteralBug(false),
254 
255     ScalarizeGlobal(false),
256 
257     HasVcmpxPermlaneHazard(false),
258     HasVMEMtoScalarWriteHazard(false),
259     HasSMEMtoVectorWriteHazard(false),
260     HasInstFwdPrefetchBug(false),
261     HasVcmpxExecWARHazard(false),
262     HasLdsBranchVmemWARHazard(false),
263     HasNSAtoVMEMBug(false),
264     HasOffset3fBug(false),
265     HasFlatSegmentOffsetBug(false),
266 
267     FeatureDisable(false),
268     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
269     TLInfo(TM, *this),
270     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
271   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
272   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
273   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
274   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
277 }
278 
279 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
280   if (getGeneration() < GFX10)
281     return 1;
282 
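  // On GFX10 most VALU instructions may use two constant bus sources (for
  // example, two SGPRs or an SGPR and a literal); the 64-bit shifts below are
  // an exception and keep the single-source limit.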
283   switch (Opcode) {
284   case AMDGPU::V_LSHLREV_B64:
285   case AMDGPU::V_LSHLREV_B64_gfx10:
286   case AMDGPU::V_LSHL_B64:
287   case AMDGPU::V_LSHRREV_B64:
288   case AMDGPU::V_LSHRREV_B64_gfx10:
289   case AMDGPU::V_LSHR_B64:
290   case AMDGPU::V_ASHRREV_I64:
291   case AMDGPU::V_ASHRREV_I64_gfx10:
292   case AMDGPU::V_ASHR_I64:
293     return 1;
294   }
295 
296   return 2;
297 }
298 
299 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
300   const Function &F) const {
301   if (NWaves == 1)
302     return getLocalMemorySize();
303   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
304   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
305   if (!WorkGroupsPerCu)
306     return 0;
307   unsigned MaxWaves = getMaxWavesPerEU();
308   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
309 }
310 
311 // FIXME: Should return min,max range.
312 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
313   const Function &F) const {
314   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
315   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
316   if (!MaxWorkGroupsPerCu)
317     return 0;
318 
319   const unsigned WaveSize = getWavefrontSize();
320 
  // FIXME: Do we need to account for the alignment requirement of LDS,
  // rounding the size up?
  // Compute the restriction based on LDS usage.
324   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
325 
326   // This can be queried with more LDS than is possible, so just assume the
327   // worst.
328   if (NumGroups == 0)
329     return 1;
330 
331   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
332 
333   // Round to the number of waves.
334   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
335   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
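  // Illustrative example (assumed values, not queried defaults): with 64 KiB
  // of LDS, 16 KiB used per work group, a 256-item flat work group and a
  // 64-lane wavefront, NumGroups = 4 and MaxGroupNumWaves = 4, so MaxWaves
  // starts at 16 here before being clamped to getMaxWavesPerEU() below
  // (assuming the per-CU work group limit does not bind).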
336 
337   // Clamp to the maximum possible number of waves.
338   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
339 
340   // FIXME: Needs to be a multiple of the group size?
341   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
342 
343   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
344          "computed invalid occupancy");
345   return MaxWaves;
346 }
347 
348 unsigned
349 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
350   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
351   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
352 }
353 
354 std::pair<unsigned, unsigned>
355 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
356   switch (CC) {
357   case CallingConv::AMDGPU_VS:
358   case CallingConv::AMDGPU_LS:
359   case CallingConv::AMDGPU_HS:
360   case CallingConv::AMDGPU_ES:
361   case CallingConv::AMDGPU_GS:
362   case CallingConv::AMDGPU_PS:
363     return std::make_pair(1, getWavefrontSize());
364   default:
365     return std::make_pair(1u, getMaxFlatWorkGroupSize());
366   }
367 }
368 
369 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
370   const Function &F) const {
371   // Default minimum/maximum flat work group sizes.
372   std::pair<unsigned, unsigned> Default =
373     getDefaultFlatWorkGroupSize(F.getCallingConv());
374 
375   // Requested minimum/maximum flat work group sizes.
376   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
377     F, "amdgpu-flat-work-group-size", Default);
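  // For illustration, such a range can be requested in IR with an attribute
  // like "amdgpu-flat-work-group-size"="128,256" (the values here are just an
  // example, not a default).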
378 
  // Make sure the requested minimum does not exceed the requested maximum.
380   if (Requested.first > Requested.second)
381     return Default;
382 
383   // Make sure requested values do not violate subtarget's specifications.
384   if (Requested.first < getMinFlatWorkGroupSize())
385     return Default;
386   if (Requested.second > getMaxFlatWorkGroupSize())
387     return Default;
388 
389   return Requested;
390 }
391 
392 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
393   const Function &F) const {
394   // Default minimum/maximum number of waves per execution unit.
395   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
396 
397   // Default/requested minimum/maximum flat work group sizes.
398   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
399 
400   // If minimum/maximum flat work group sizes were explicitly requested using
401   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
402   // number of waves per execution unit to values implied by requested
403   // minimum/maximum flat work group sizes.
404   unsigned MinImpliedByFlatWorkGroupSize =
405     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
406   bool RequestedFlatWorkGroupSize = false;
407 
408   if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
409     Default.first = MinImpliedByFlatWorkGroupSize;
410     RequestedFlatWorkGroupSize = true;
411   }
412 
413   // Requested minimum/maximum number of waves per execution unit.
414   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
415     F, "amdgpu-waves-per-eu", Default, true);
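  // For illustration, this is requested with an attribute such as
  // "amdgpu-waves-per-eu"="2,4" (example values); the maximum may be omitted,
  // which is why only the first value is required above.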
416 
  // Make sure the requested minimum does not exceed the requested maximum.
418   if (Requested.second && Requested.first > Requested.second)
419     return Default;
420 
421   // Make sure requested values do not violate subtarget's specifications.
422   if (Requested.first < getMinWavesPerEU() ||
423       Requested.first > getMaxWavesPerEU())
424     return Default;
425   if (Requested.second > getMaxWavesPerEU())
426     return Default;
427 
428   // Make sure requested values are compatible with values implied by requested
429   // minimum/maximum flat work group sizes.
430   if (RequestedFlatWorkGroupSize &&
431       Requested.first < MinImpliedByFlatWorkGroupSize)
432     return Default;
433 
434   return Requested;
435 }
436 
437 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
438   Function *Kernel = I->getParent()->getParent();
439   unsigned MinSize = 0;
440   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
441   bool IdQuery = false;
442 
  // If reqd_work_group_size is present, it narrows the value down.
444   if (auto *CI = dyn_cast<CallInst>(I)) {
445     const Function *F = CI->getCalledFunction();
446     if (F) {
447       unsigned Dim = UINT_MAX;
448       switch (F->getIntrinsicID()) {
449       case Intrinsic::amdgcn_workitem_id_x:
450       case Intrinsic::r600_read_tidig_x:
451         IdQuery = true;
452         LLVM_FALLTHROUGH;
453       case Intrinsic::r600_read_local_size_x:
454         Dim = 0;
455         break;
456       case Intrinsic::amdgcn_workitem_id_y:
457       case Intrinsic::r600_read_tidig_y:
458         IdQuery = true;
459         LLVM_FALLTHROUGH;
460       case Intrinsic::r600_read_local_size_y:
461         Dim = 1;
462         break;
463       case Intrinsic::amdgcn_workitem_id_z:
464       case Intrinsic::r600_read_tidig_z:
465         IdQuery = true;
466         LLVM_FALLTHROUGH;
467       case Intrinsic::r600_read_local_size_z:
468         Dim = 2;
469         break;
470       default:
471         break;
472       }
473       if (Dim <= 3) {
474         if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
475           if (Node->getNumOperands() == 3)
476             MinSize = MaxSize = mdconst::extract<ConstantInt>(
477                                   Node->getOperand(Dim))->getZExtValue();
478       }
479     }
480   }
481 
482   if (!MaxSize)
483     return false;
484 
  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
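  // Illustrative example: with reqd_work_group_size = {64, 1, 1}, a
  // workitem id query in the x dimension gets !range !{i32 0, i32 64}, while
  // a local size query in the same dimension gets !range !{i32 64, i32 65}.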
487   if (IdQuery)
488     MinSize = 0;
489   else
490     ++MaxSize;
491 
492   MDBuilder MDB(I->getContext());
493   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
494                                                   APInt(32, MaxSize));
495   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
496   return true;
497 }
498 
499 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
500                                                  Align &MaxAlign) const {
501   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
502          F.getCallingConv() == CallingConv::SPIR_KERNEL);
503 
504   const DataLayout &DL = F.getParent()->getDataLayout();
505   uint64_t ExplicitArgBytes = 0;
506   MaxAlign = Align(1);
507 
508   for (const Argument &Arg : F.args()) {
509     Type *ArgTy = Arg.getType();
510 
511     const Align Alignment(DL.getABITypeAlignment(ArgTy));
512     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
513     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
514     MaxAlign = std::max(MaxAlign, Alignment);
515   }
516 
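  // Illustrative example (assumed signature): for kernel arguments
  // (i32, <4 x float>), the i32 occupies bytes [0, 4), the vector is padded
  // to offset 16 and occupies [16, 32), so ExplicitArgBytes is 32 and
  // MaxAlign is 16.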
517   return ExplicitArgBytes;
518 }
519 
520 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
521                                                 Align &MaxAlign) const {
522   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
523 
524   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
525 
526   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
527   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
528   if (ImplicitBytes != 0) {
529     const Align Alignment = getAlignmentForImplicitArgPtr();
530     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
531   }
532 
533   // Being able to dereference past the end is useful for emitting scalar loads.
534   return alignTo(TotalSize, 4);
535 }
536 
537 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
538                              const TargetMachine &TM) :
539   R600GenSubtargetInfo(TT, GPU, FS),
540   AMDGPUSubtarget(TT),
541   InstrInfo(*this),
542   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
543   FMA(false),
544   CaymanISA(false),
545   CFALUBug(false),
546   HasVertexCache(false),
547   R600ALUInst(false),
548   FP64(false),
549   TexVTXClauseSize(0),
550   Gen(R600),
551   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
552   InstrItins(getInstrItineraryForCPU(GPU)) { }
553 
554 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
555                                       unsigned NumRegionInstrs) const {
556   // Track register pressure so the scheduler can try to decrease
557   // pressure once register usage is above the threshold defined by
558   // SIRegisterInfo::getRegPressureSetLimit()
559   Policy.ShouldTrackPressure = true;
560 
  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
563   Policy.OnlyTopDown = false;
564   Policy.OnlyBottomUp = false;
565 
566   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
567   if (!enableSIScheduler())
568     Policy.ShouldTrackLaneMasks = true;
569 }
570 
571 bool GCNSubtarget::hasMadF16() const {
572   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
573 }
574 
575 bool GCNSubtarget::useVGPRIndexMode() const {
576   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
577 }
578 
579 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
580   if (getGeneration() >= AMDGPUSubtarget::GFX10)
581     return getMaxWavesPerEU();
582 
583   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
584     if (SGPRs <= 80)
585       return 10;
586     if (SGPRs <= 88)
587       return 9;
588     if (SGPRs <= 100)
589       return 8;
590     return 7;
591   }
592   if (SGPRs <= 48)
593     return 10;
594   if (SGPRs <= 56)
595     return 9;
596   if (SGPRs <= 64)
597     return 8;
598   if (SGPRs <= 72)
599     return 7;
600   if (SGPRs <= 80)
601     return 6;
602   return 5;
603 }
604 
605 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
606   unsigned MaxWaves = getMaxWavesPerEU();
607   unsigned Granule = getVGPRAllocGranule();
608   if (VGPRs < Granule)
609     return MaxWaves;
610   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
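  // Illustrative example (assumed values): with a granule of 4 and 256 total
  // VGPRs, 70 live VGPRs round up to 72, giving
  // std::min(std::max(256 / 72, 1u), MaxWaves) = 3 waves.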
611   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
612 }
613 
614 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
615   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
616   if (getGeneration() >= AMDGPUSubtarget::GFX10)
617     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
618 
619   if (MFI.hasFlatScratchInit()) {
620     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
621       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
622     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
623       return 4; // FLAT_SCRATCH, VCC (in that order).
624   }
625 
626   if (isXNACKEnabled())
627     return 4; // XNACK, VCC (in that order).
628   return 2; // VCC.
629 }
630 
631 unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
632                                         unsigned LDSSize,
633                                         unsigned NumSGPRs,
634                                         unsigned NumVGPRs) const {
635   unsigned Occupancy =
636     std::min(getMaxWavesPerEU(),
637              getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
638   if (NumSGPRs)
639     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
640   if (NumVGPRs)
641     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
642   return Occupancy;
643 }
644 
645 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
646   const Function &F = MF.getFunction();
647   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
648 
  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
651   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
652   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
653   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
654 
655   // Check if maximum number of SGPRs was explicitly requested using
656   // "amdgpu-num-sgpr" attribute.
657   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
658     unsigned Requested = AMDGPU::getIntegerAttribute(
659       F, "amdgpu-num-sgpr", MaxNumSGPRs);
660 
661     // Make sure requested value does not violate subtarget's specifications.
662     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
663       Requested = 0;
664 
665     // If more SGPRs are required to support the input user/system SGPRs,
666     // increase to accommodate them.
667     //
668     // FIXME: This really ends up using the requested number of SGPRs + number
669     // of reserved special registers in total. Theoretically you could re-use
670     // the last input registers for these special registers, but this would
671     // require a lot of complexity to deal with the weird aliasing.
672     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
673     if (Requested && Requested < InputNumSGPRs)
674       Requested = InputNumSGPRs;
675 
676     // Make sure requested value is compatible with values implied by
677     // default/requested minimum/maximum number of waves per execution unit.
678     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
679       Requested = 0;
680     if (WavesPerEU.second &&
681         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
682       Requested = 0;
683 
684     if (Requested)
685       MaxNumSGPRs = Requested;
686   }
687 
688   if (hasSGPRInitBug())
689     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
690 
691   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
692                   MaxAddressableNumSGPRs);
693 }
694 
695 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
696   const Function &F = MF.getFunction();
697   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
698 
  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
701   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
702   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
703 
704   // Check if maximum number of VGPRs was explicitly requested using
705   // "amdgpu-num-vgpr" attribute.
706   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
707     unsigned Requested = AMDGPU::getIntegerAttribute(
708       F, "amdgpu-num-vgpr", MaxNumVGPRs);
709 
710     // Make sure requested value is compatible with values implied by
711     // default/requested minimum/maximum number of waves per execution unit.
712     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
713       Requested = 0;
714     if (WavesPerEU.second &&
715         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
716       Requested = 0;
717 
718     if (Requested)
719       MaxNumVGPRs = Requested;
720   }
721 
722   return MaxNumVGPRs;
723 }
724 
725 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
726                                          int UseOpIdx, SDep &Dep) const {
727   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
728       !Def->isInstr() || !Use->isInstr())
729     return;
730 
731   MachineInstr *DefI = Def->getInstr();
732   MachineInstr *UseI = Use->getInstr();
733 
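  // If the producer is a bundle, use the latency of the bundled instruction
  // that actually writes the register, reduced by one for each bundled
  // instruction issued after it. If the consumer is a bundle, reduce the
  // producer's latency by the number of bundled instructions issued before
  // the first one that reads the register.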
734   if (DefI->isBundle()) {
735     const SIRegisterInfo *TRI = getRegisterInfo();
736     auto Reg = Dep.getReg();
737     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
738     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
739     unsigned Lat = 0;
740     for (++I; I != E && I->isBundledWithPred(); ++I) {
741       if (I->modifiesRegister(Reg, TRI))
742         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
743       else if (Lat)
744         --Lat;
745     }
746     Dep.setLatency(Lat);
747   } else if (UseI->isBundle()) {
748     const SIRegisterInfo *TRI = getRegisterInfo();
749     auto Reg = Dep.getReg();
750     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
751     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
752     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
753     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
754       if (I->readsRegister(Reg, TRI))
755         break;
756       --Lat;
757     }
758     Dep.setLatency(Lat);
759   }
760 }
761 
762 namespace {
763 struct FillMFMAShadowMutation : ScheduleDAGMutation {
764   const SIInstrInfo *TII;
765 
766   ScheduleDAGMI *DAG;
767 
768   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
769 
770   bool isSALU(const SUnit *SU) const {
771     const MachineInstr *MI = SU->getInstr();
772     return MI && TII->isSALU(*MI) && !MI->isTerminator();
773   }
774 
775   bool isVALU(const SUnit *SU) const {
776     const MachineInstr *MI = SU->getInstr();
777     return MI && TII->isVALU(*MI);
778   }
779 
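  // Returns true if an artificial edge from Pred to Succ can be added without
  // creating a cycle: a Pred with a lower node number is accepted immediately;
  // otherwise the transitive successors of Succ are collected and checked
  // against the transitive predecessors of Pred for an overlap.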
780   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
781     if (Pred->NodeNum < Succ->NodeNum)
782       return true;
783 
784     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
785 
786     for (unsigned I = 0; I < Succs.size(); ++I) {
787       for (const SDep &SI : Succs[I]->Succs) {
788         const SUnit *SU = SI.getSUnit();
789         if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
790           Succs.push_back(SU);
791       }
792     }
793 
794     SmallPtrSet<const SUnit*, 32> Visited;
795     while (!Preds.empty()) {
796       const SUnit *SU = Preds.pop_back_val();
797       if (llvm::find(Succs, SU) != Succs.end())
798         return false;
799       Visited.insert(SU);
800       for (const SDep &SI : SU->Preds)
801         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
802           Preds.push_back(SI.getSUnit());
803     }
804 
805     return true;
806   }
807 
  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
810   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
811                          SmallPtrSetImpl<SUnit *> &Visited) const {
812     SmallVector<SUnit *, 8> Worklist({To});
813     unsigned Linked = 0;
814 
815     while (!Worklist.empty() && MaxChain-- > 0) {
816       SUnit *SU = Worklist.pop_back_val();
817       if (!Visited.insert(SU).second)
818         continue;
819 
820       LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
821                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
822 
823       if (SU->addPred(SDep(From, SDep::Artificial), false))
824         ++Linked;
825 
826       for (SDep &SI : From->Succs) {
827         SUnit *SUv = SI.getSUnit();
828         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
829           SUv->addPred(SDep(SU, SDep::Artificial), false);
830       }
831 
832       for (SDep &SI : SU->Succs) {
833         SUnit *Succ = SI.getSUnit();
834         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
835           Worklist.push_back(Succ);
836       }
837     }
838 
839     return Linked;
840   }
841 
842   void apply(ScheduleDAGInstrs *DAGInstrs) override {
843     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
844     if (!ST.hasMAIInsts() || DisablePowerSched)
845       return;
846     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
847     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
848     if (!TSchedModel || DAG->SUnits.empty())
849       return;
850 
    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
855     auto LastSALU = DAG->SUnits.begin();
856     auto E = DAG->SUnits.end();
857     SmallPtrSet<SUnit*, 32> Visited;
858     for (SUnit &SU : DAG->SUnits) {
859       MachineInstr &MAI = *SU.getInstr();
860       if (!TII->isMAI(MAI) ||
861            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
862            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
863         continue;
864 
865       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
866 
867       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
868                  dbgs() << "Need " << Lat
869                         << " instructions to cover latency.\n");
870 
871       // Find up to Lat independent scalar instructions as early as
872       // possible such that they can be scheduled after this MFMA.
873       for ( ; Lat && LastSALU != E; ++LastSALU) {
874         if (Visited.count(&*LastSALU))
875           continue;
876 
877         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
878           continue;
879 
880         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
881       }
882     }
883   }
884 };
885 } // namespace
886 
887 void GCNSubtarget::getPostRAMutations(
888     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
889   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
890 }
891 
892 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
893   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
894     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
895   else
896     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
897 }
898 
899 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
900   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
901     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
902   else
903     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
904 }
905