//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is explicitly enabled or disabled, turn it on for
  // all OSes on VI and newer hardware to avoid assertion failures due to
  // missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets that don't support it unless it was explicitly
  // requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

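// Number of scalar (SGPR or literal constant) operands a single VALU
// instruction may read: one before GFX10, two on GFX10+ except for the 64-bit
// shifts listed below.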
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
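    // 64-bit shifts keep the single constant bus use limit even on GFX10.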
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
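  // The LDS on a CU is shared by its resident workgroups. Running at NWaves
  // of the possible MaxWaves leaves each workgroup a proportionally larger
  // share of the LDS.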
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS
  // rounding the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves =
      (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1u, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to values implied by
  // the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 2) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

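  // E.g. a workitem id query in a kernel with a reqd_work_group_size of 64 in
  // this dimension gets !range [0, 64), i.e. ids 0..63.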
  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

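// Sum the sizes of the explicit kernel arguments, placing each at its byref
// or ABI type alignment; MaxAlign is set to the largest alignment seen.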
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

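// Use VGPR indexing when movrel is unavailable, or when it was explicitly
// requested via -amdgpu-vgpr-index-mode and the hardware supports it.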
bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

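// Occupancy as limited by SGPR usage. GFX10+ no longer limits occupancy by
// the SGPR count; for older generations the thresholds below presumably
// reflect the per-SIMD SGPR pool and its allocation granularity.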
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
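  // Round the VGPR count up to the allocation granule. E.g. with a granule of
  // 4 and 256 total VGPRs, a wave using 13 VGPRs is allocated 16, allowing
  // min(256 / 16, MaxWaves) = 16 waves; the exact numbers are per-subtarget.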
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

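// Occupancy is the tightest of the LDS-, SGPR- and VGPR-imposed limits; a
// register count of zero means that limit is not applied.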
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

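  // If either end of the data dependency is a bundle, refine the latency to
  // the bundled instruction that actually writes or reads the register,
  // rather than charging the whole bundle.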
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
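// Pads the latency shadow of long-latency MFMA instructions with independent
// SALU work so the scheduler does not fill it with VALU instructions; see
// apply() below.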
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

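  // Return true if an artificial edge Pred -> Succ can be added without
  // creating a cycle, i.e. if Succ does not already transitively reach Pred.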
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}