//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }
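  // For illustration: an incoming FS of "+wavefrontsize32" takes this path and
  // prepends "-wavefrontsize16,-wavefrontsize64," before FS is appended below,
  // so only the explicitly requested wavefront size survives
  // ParseSubtargetFeatures.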

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}
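
// Illustrative note on the limit computed above: before GFX10 every VALU
// instruction may use at most one constant-bus source (SGPR or literal), while
// on GFX10 most VALU instructions may use two; the 64-bit shifts listed in the
// switch keep the pre-GFX10 limit of one.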

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
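
// Worked example for the formula above (assumed numbers): with 65536 bytes of
// LDS, MaxWaves = 10, 8 work groups per CU and NWaves = 5, each wave's group
// may use 65536 * 10 / 8 / 5 = 16384 bytes.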

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}
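
// Worked example (assumed numbers): with 65536 bytes of LDS, Bytes = 16384, a
// 256-item flat work group size and wave64, NumGroups = 4 and
// MaxGroupNumWaves = 4, giving 16 waves before the clamp to getMaxWavesPerEU().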

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
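
// For illustration, a kernel can request its own bounds in IR; the attribute
// value is parsed as a "min,max" pair and must stay within the subtarget
// limits checked above, e.g.:
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,256" }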

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
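
// For illustration, the corresponding IR attribute is also a "min,max" pair,
// e.g. "amdgpu-waves-per-eu"="2,4"; per the checks above the requested minimum
// may still be overridden by the value implied by the flat work group size.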

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
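
// For illustration, with a reqd_work_group_size of 64 in the queried dimension
// the code above attaches !range !{i32 0, i32 64} to a workitem id call and
// !range !{i32 64, i32 65} to a local size query.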

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}
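
// Worked example for the loop above: a kernel taking (i32, double) places the
// i32 at offset 0 and the 8-byte-aligned double at offset 8, so
// getExplicitKernArgSize returns 16 with MaxAlign = 8.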

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}
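
// For illustration: on VOLCANIC_ISLANDS a kernel using 96 SGPRs is limited to
// 8 waves per EU while 84 SGPRs would still allow 9; on GFX10 and later SGPR
// usage no longer limits occupancy.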

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
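
// Worked example (assumed numbers): with a 4-register allocation granule and
// 256 total VGPRs, a kernel using 70 VGPRs is rounded up to 72, limiting
// occupancy to 256 / 72 = 3 waves per EU.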

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
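
// For illustration, the register budgets above can be narrowed per function in
// IR with single-integer attribute values, e.g.:
//   attributes #0 = { "amdgpu-num-sgpr"="64" "amdgpu-num-vgpr"="32" }
// Out-of-range requests are ignored by the checks above.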

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU instructions rather than VALU
    // is preferable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}