//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly given, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // XNACK is enabled by default above; disable it on targets that do not
  // support it unless it was explicitly requested in the feature string.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

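  // The 64-bit shift instructions below keep the pre-GFX10 limit of a single
  // constant-bus operand.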
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

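// Estimates the LDS budget that still allows NWaves waves per execution unit,
// based on the function's maximum flat work-group size.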
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS by
  // rounding the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is not larger than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is not larger than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

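// Attaches !range metadata to a work-item id or local-size query, bounding it
// by the kernel's maximum work-group size (narrowed by reqd_work_group_size
// when present).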
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query the values are already in
  // [0, MaxSize), so MaxSize is the upper bound. For a size query the result
  // can equal MaxSize, so the upper bound must be MaxSize + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

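// V_MAD_F16 is usable only if the pseudo maps to a real MC opcode on this
// subtarget.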
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

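// Prefer movrel when it is available; use VGPR indexing mode only when movrel
// is missing, or when the mode is supported and explicitly requested via
// -amdgpu-vgpr-index-mode.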
bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
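  // Round the VGPR count up to the allocation granule before dividing the
  // total number of VGPRs to see how many waves fit.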
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

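// Returns the expected occupancy for a function given its LDS usage and, when
// non-zero, its SGPR and VGPR counts: the minimum of the individual limits.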
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

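  // When either endpoint of the dependence is a bundle, derive the latency
  // from the bundled instruction that actually defines or reads Reg instead
  // of from the bundle header.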
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
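// DAG mutation that links independent SALU instructions behind long-latency
// MFMA instructions so their shadow is filled with scalar rather than vector
// work (see apply() below).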
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

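  // Returns true if an artificial edge from Pred to Succ can be added without
  // creating a cycle, i.e. there is no existing path from Succ back to Pred.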
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so they get a chance to be scheduled into
    // the MFMA shadow. Filling that shadow with SALU rather than VALU
    // instructions is desirable to prevent power consumption bursts and
    // throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}