//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

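  // Append the user-provided feature string last: subtarget features are
  // applied in order, so anything the user specifies explicitly overrides the
  // defaults assembled above.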
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly specified, turn
  // on FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

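// On GFX10 a VALU instruction may generally read two scalar (constant bus)
// operands, whereas earlier generations and the 64-bit shift opcodes listed
// below remain limited to a single constant bus read.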
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

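// Estimate how much LDS a work group may use while still sustaining NWaves
// waves per EU: the CU's LDS is shared among the work groups resident on it,
// so scale the total LDS by the fraction of the maximum wave count that is
// actually requested.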
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

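// Graphics shader calling conventions default to a single wavefront per work
// group; compute and other callers default to the subtarget's full flat work
// group size range.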
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

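// Attach !range metadata to work-item ID and local-size intrinsic calls,
// bounded by the kernel's flat work group size and tightened by
// reqd_work_group_size when present. Returns true if metadata was added.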
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

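// Sum the in-memory sizes of the kernel's explicit arguments, placing each
// one at its ABI type alignment, and report the largest alignment seen via
// MaxAlign.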
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

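// The kernarg segment holds the explicit arguments at the target's explicit
// kernarg offset; if the target supplies implicit (hidden) arguments they are
// appended after realigning the explicit block.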
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

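// Prefer VGPR indexing mode when movrel is unavailable, or when the hardware
// supports it and the user opted in via -amdgpu-vgpr-index-mode.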
bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

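// Map a per-wave SGPR count to the number of waves an EU can host. The break
// points below presumably follow the per-SIMD SGPR budget and allocation
// granularity of pre-GFX10 parts; on GFX10 the SGPR file no longer limits
// occupancy.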
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

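// VGPRs are allocated in fixed-size granules: round the per-wave request up
// to a granule boundary, then divide the total VGPR file by it and clamp to
// the maximum wave count.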
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

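// Final occupancy is the minimum of the limits imposed by LDS usage, SGPR
// count and VGPR count; a register count of zero means "not known yet" and
// places no additional limit.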
unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

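// Adjust the latency of a data dependency when either end is a bundle: for a
// defining bundle, use the latency of the bundled instruction that actually
// writes the register, reduced by one per instruction issued after it; for a
// using bundle, reduce the def's latency by one per bundled instruction that
// issues before the first reader of the register.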
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

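  // Returns true if an artificial edge Pred -> Succ can be added without
  // creating a cycle: collect Succ's transitive successors and walk Pred's
  // transitive predecessors, rejecting the edge if the two sets intersect.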
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Returns the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to make available SALU
    // instructions depend on them so they can be scheduled into the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}