//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to the missing
  // ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

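  // Note (inferred, not from the original sources): the 64-bit shift opcodes
  // below appear to allow only a single constant bus (scalar) operand even on
  // GFX10, which is presumably why they are special-cased to a limit of 1.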
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
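  // Rough estimate: LDS on a CU is shared by the resident work groups, so
  // scale the total LDS by the occupancy ratio (MaxWaves / NWaves) and divide
  // it among the work groups per CU.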
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

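  // Walk the explicit kernel arguments in order: pad the running byte count
  // up to each argument's ABI alignment, add its allocation size, and track
  // the largest alignment seen.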
  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
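  // VGPRs are allocated in multiples of the allocation granule, so round the
  // requested count up to the granule before dividing it into the register
  // file.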
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
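    // Rough approximation: use the latency of the last bundled instruction
    // that writes Reg, reduced by one for each bundled instruction issued
    // after that write.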
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
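    // Rough approximation: start from the def's full latency and subtract one
    // for each bundled instruction issued before the first one that reads Reg.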
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

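  // Conservatively decide whether an artificial edge Pred -> Succ can be
  // added without creating a cycle: if Pred is numbered before Succ it is
  // assumed safe; otherwise walk the DAG and reject the edge if Succ can
  // already reach Pred or any of Pred's transitive predecessors.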
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}