1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
42 
43 static cl::opt<bool> DisablePowerSched(
44   "amdgpu-disable-power-sched",
45   cl::desc("Disable scheduling to minimize mAI power bursts"),
46   cl::init(false));
47 
48 static cl::opt<bool> EnableVGPRIndexMode(
49   "amdgpu-vgpr-index-mode",
50   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51   cl::init(false));
52 
53 GCNSubtarget::~GCNSubtarget() = default;
54 
55 R600Subtarget &
56 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
57                                                StringRef GPU, StringRef FS) {
58   SmallString<256> FullFS("+promote-alloca,");
59   FullFS += FS;
60   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
61 
62   HasMulU24 = getGeneration() >= EVERGREEN;
63   HasMulI24 = hasCaymanISA();
64 
65   return *this;
66 }
67 
68 GCNSubtarget &
69 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
70                                               StringRef GPU, StringRef FS) {
71   // Determine default and user-specified characteristics
72   //
73   // We want to be able to turn these off, but making this a subtarget feature
74   // for SI has the unhelpful behavior that it unsets everything else if you
75   // disable it.
76   //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
79 
80   // Assuming ECC is enabled is the conservative default.
81   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
82 
83   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
84     FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
85 
86   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
87 
88   // Disable mutually exclusive bits.
89   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
90     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
91       FullFS += "-wavefrontsize16,";
92     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
93       FullFS += "-wavefrontsize32,";
94     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
95       FullFS += "-wavefrontsize64,";
96   }
97 
98   FullFS += FS;
99 
100   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
101 
102   // We don't support FP64 for EG/NI atm.
103   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
104 
105   // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
106   // on VI and newer hardware to avoid assertion failures due to missing ADDR64
107   // variants of MUBUF instructions.
108   if (!hasAddr64() && !FS.contains("flat-for-global")) {
109     FlatForGlobal = true;
110   }
111 
112   // Set defaults if needed.
113   if (MaxPrivateElementSize == 0)
114     MaxPrivateElementSize = 4;
115 
116   if (LDSBankCount == 0)
117     LDSBankCount = 32;
118 
119   if (TT.getArch() == Triple::amdgcn) {
120     if (LocalMemorySize == 0)
121       LocalMemorySize = 32768;
122 
123     // Do something sensible for unspecified target.
124     if (!HasMovrel && !HasVGPRIndexMode)
125       HasMovrel = true;
126   }
127 
128   // Don't crash on invalid devices.
129   if (WavefrontSizeLog2 == 0)
130     WavefrontSizeLog2 = 5;
131 
132   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
133 
  // XNACK is force-enabled in FullFS above; turn it back off on targets that
  // do not support it unless it was explicitly requested.
136   if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
137     ToggleFeature(AMDGPU::FeatureXNACK);
138     EnableXNACK = false;
139   }
140 
  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but don't
  // support ECC.
144   if (DoesNotSupportSRAMECC && EnableSRAMECC) {
145     ToggleFeature(AMDGPU::FeatureSRAMECC);
146     EnableSRAMECC = false;
147   }
148 
149   return *this;
150 }
151 
152 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
153   TargetTriple(TT),
154   Has16BitInsts(false),
155   HasMadMixInsts(false),
156   HasMadMacF32Insts(false),
157   HasDsSrc2Insts(false),
158   HasSDWA(false),
159   HasVOP3PInsts(false),
160   HasMulI24(true),
161   HasMulU24(true),
162   HasInv2PiInlineImm(false),
163   HasFminFmaxLegacy(true),
164   EnablePromoteAlloca(false),
165   HasTrigReducedRange(false),
166   MaxWavesPerEU(10),
167   LocalMemorySize(0),
168   WavefrontSizeLog2(0)
169   { }
170 
171 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
172                            const GCNTargetMachine &TM) :
173     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
174     AMDGPUSubtarget(TT),
175     TargetTriple(TT),
176     Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
177     InstrItins(getInstrItineraryForCPU(GPU)),
178     LDSBankCount(0),
179     MaxPrivateElementSize(0),
180 
181     FastFMAF32(false),
182     FastDenormalF32(false),
183     HalfRate64Ops(false),
184 
185     FlatForGlobal(false),
186     AutoWaitcntBeforeBarrier(false),
187     CodeObjectV3(false),
188     UnalignedScratchAccess(false),
189     UnalignedBufferAccess(false),
190     UnalignedAccessMode(false),
191 
192     HasApertureRegs(false),
193     EnableXNACK(false),
194     DoesNotSupportXNACK(false),
195     EnableCuMode(false),
196     TrapHandler(false),
197 
198     EnableLoadStoreOpt(false),
199     EnableUnsafeDSOffsetFolding(false),
200     EnableSIScheduler(false),
201     EnableDS128(false),
202     EnablePRTStrictNull(false),
203     DumpCode(false),
204 
205     FP64(false),
206     GCN3Encoding(false),
207     CIInsts(false),
208     GFX8Insts(false),
209     GFX9Insts(false),
210     GFX10Insts(false),
211     GFX10_3Insts(false),
212     GFX7GFX8GFX9Insts(false),
213     SGPRInitBug(false),
214     HasSMemRealTime(false),
215     HasIntClamp(false),
216     HasFmaMixInsts(false),
217     HasMovrel(false),
218     HasVGPRIndexMode(false),
219     HasScalarStores(false),
220     HasScalarAtomics(false),
221     HasSDWAOmod(false),
222     HasSDWAScalar(false),
223     HasSDWASdst(false),
224     HasSDWAMac(false),
225     HasSDWAOutModsVOPC(false),
226     HasDPP(false),
227     HasDPP8(false),
228     HasR128A16(false),
229     HasGFX10A16(false),
230     HasG16(false),
231     HasNSAEncoding(false),
232     GFX10_BEncoding(false),
233     HasDLInsts(false),
234     HasDot1Insts(false),
235     HasDot2Insts(false),
236     HasDot3Insts(false),
237     HasDot4Insts(false),
238     HasDot5Insts(false),
239     HasDot6Insts(false),
240     HasMAIInsts(false),
241     HasPkFmacF16Inst(false),
242     HasAtomicFaddInsts(false),
243     EnableSRAMECC(false),
244     DoesNotSupportSRAMECC(false),
245     HasNoSdstCMPX(false),
246     HasVscnt(false),
247     HasGetWaveIdInst(false),
248     HasSMemTimeInst(false),
249     HasRegisterBanking(false),
250     HasVOP3Literal(false),
251     HasNoDataDepHazard(false),
252     FlatAddressSpace(false),
253     FlatInstOffsets(false),
254     FlatGlobalInsts(false),
255     FlatScratchInsts(false),
256     ScalarFlatScratchInsts(false),
257     AddNoCarryInsts(false),
258     HasUnpackedD16VMem(false),
259     LDSMisalignedBug(false),
260     HasMFMAInlineLiteralBug(false),
261     UnalignedDSAccess(false),
262 
263     ScalarizeGlobal(false),
264 
265     HasVcmpxPermlaneHazard(false),
266     HasVMEMtoScalarWriteHazard(false),
267     HasSMEMtoVectorWriteHazard(false),
268     HasInstFwdPrefetchBug(false),
269     HasVcmpxExecWARHazard(false),
270     HasLdsBranchVmemWARHazard(false),
271     HasNSAtoVMEMBug(false),
272     HasOffset3fBug(false),
273     HasFlatSegmentOffsetBug(false),
274     HasImageStoreD16Bug(false),
275     HasImageGather4D16Bug(false),
276 
277     FeatureDisable(false),
278     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
279     TLInfo(TM, *this),
280     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
281   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
282   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
283   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
284   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
285   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
288 }
289 
290 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
291   if (getGeneration() < GFX10)
292     return 1;
293 
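  // On GFX10+ most VALU instructions may use two scalar (SGPR or literal)
  // operands, but the 64-bit shifts below are limited to a single one.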
294   switch (Opcode) {
295   case AMDGPU::V_LSHLREV_B64:
296   case AMDGPU::V_LSHLREV_B64_gfx10:
297   case AMDGPU::V_LSHL_B64:
298   case AMDGPU::V_LSHRREV_B64:
299   case AMDGPU::V_LSHRREV_B64_gfx10:
300   case AMDGPU::V_LSHR_B64:
301   case AMDGPU::V_ASHRREV_I64:
302   case AMDGPU::V_ASHRREV_I64_gfx10:
303   case AMDGPU::V_ASHR_I64:
304     return 1;
305   }
306 
307   return 2;
308 }
309 
310 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
311   const Function &F) const {
312   if (NWaves == 1)
313     return getLocalMemorySize();
314   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
315   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
316   if (!WorkGroupsPerCu)
317     return 0;
318   unsigned MaxWaves = getMaxWavesPerEU();
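  // Each workgroup gets an equal share of the CU's LDS; requesting fewer
  // resident waves than the maximum scales that share up proportionally.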
319   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
320 }
321 
322 // FIXME: Should return min,max range.
323 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
324   const Function &F) const {
325   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
326   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
327   if (!MaxWorkGroupsPerCu)
328     return 0;
329 
330   const unsigned WaveSize = getWavefrontSize();
331 
  // FIXME: Do we need to account for the alignment requirement of LDS,
  // rounding the size up?
  // Compute the occupancy restriction based on LDS usage.
335   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
336 
337   // This can be queried with more LDS than is possible, so just assume the
338   // worst.
339   if (NumGroups == 0)
340     return 1;
341 
342   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
343 
344   // Round to the number of waves.
  const unsigned MaxGroupNumWaves =
      (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
346   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
347 
348   // Clamp to the maximum possible number of waves.
349   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
350 
351   // FIXME: Needs to be a multiple of the group size?
352   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
353 
354   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
355          "computed invalid occupancy");
356   return MaxWaves;
357 }
358 
359 unsigned
360 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
361   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
362   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
363 }
364 
365 std::pair<unsigned, unsigned>
366 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
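  // Graphics shader calling conventions default to at most one wavefront per
  // workgroup; everything else may use the full flat workgroup size range.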
367   switch (CC) {
368   case CallingConv::AMDGPU_VS:
369   case CallingConv::AMDGPU_LS:
370   case CallingConv::AMDGPU_HS:
371   case CallingConv::AMDGPU_ES:
372   case CallingConv::AMDGPU_GS:
373   case CallingConv::AMDGPU_PS:
374     return std::make_pair(1, getWavefrontSize());
375   default:
376     return std::make_pair(1u, getMaxFlatWorkGroupSize());
377   }
378 }
379 
380 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
381   const Function &F) const {
382   // Default minimum/maximum flat work group sizes.
383   std::pair<unsigned, unsigned> Default =
384     getDefaultFlatWorkGroupSize(F.getCallingConv());
385 
386   // Requested minimum/maximum flat work group sizes.
387   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
388     F, "amdgpu-flat-work-group-size", Default);
389 
  // Make sure requested minimum is not larger than requested maximum.
391   if (Requested.first > Requested.second)
392     return Default;
393 
394   // Make sure requested values do not violate subtarget's specifications.
395   if (Requested.first < getMinFlatWorkGroupSize())
396     return Default;
397   if (Requested.second > getMaxFlatWorkGroupSize())
398     return Default;
399 
400   return Requested;
401 }
402 
403 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
404   const Function &F) const {
405   // Default minimum/maximum number of waves per execution unit.
406   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
407 
408   // Default/requested minimum/maximum flat work group sizes.
409   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
410 
411   // If minimum/maximum flat work group sizes were explicitly requested using
412   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
413   // number of waves per execution unit to values implied by requested
414   // minimum/maximum flat work group sizes.
415   unsigned MinImpliedByFlatWorkGroupSize =
416     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
417   Default.first = MinImpliedByFlatWorkGroupSize;
418   bool RequestedFlatWorkGroupSize =
419       F.hasFnAttribute("amdgpu-flat-work-group-size");
420 
421   // Requested minimum/maximum number of waves per execution unit.
422   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
423     F, "amdgpu-waves-per-eu", Default, true);
424 
  // Make sure requested minimum is not larger than requested maximum.
426   if (Requested.second && Requested.first > Requested.second)
427     return Default;
428 
429   // Make sure requested values do not violate subtarget's specifications.
430   if (Requested.first < getMinWavesPerEU() ||
431       Requested.second > getMaxWavesPerEU())
432     return Default;
433 
434   // Make sure requested values are compatible with values implied by requested
435   // minimum/maximum flat work group sizes.
436   if (RequestedFlatWorkGroupSize &&
437       Requested.first < MinImpliedByFlatWorkGroupSize)
438     return Default;
439 
440   return Requested;
441 }
442 
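// Return the kernel's required workgroup size in the given dimension from the
// reqd_work_group_size metadata, or UINT_MAX if it is not specified.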
443 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
444   auto Node = Kernel.getMetadata("reqd_work_group_size");
445   if (Node && Node->getNumOperands() == 3)
446     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
447   return std::numeric_limits<unsigned>::max();
448 }
449 
450 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
451                                            unsigned Dimension) const {
452   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
453   if (ReqdSize != std::numeric_limits<unsigned>::max())
454     return ReqdSize - 1;
455   return getFlatWorkGroupSizes(Kernel).second - 1;
456 }
457 
458 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
459   Function *Kernel = I->getParent()->getParent();
460   unsigned MinSize = 0;
461   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
462   bool IdQuery = false;
463 
  // If reqd_work_group_size is present, it narrows the value down.
465   if (auto *CI = dyn_cast<CallInst>(I)) {
466     const Function *F = CI->getCalledFunction();
467     if (F) {
468       unsigned Dim = UINT_MAX;
469       switch (F->getIntrinsicID()) {
470       case Intrinsic::amdgcn_workitem_id_x:
471       case Intrinsic::r600_read_tidig_x:
472         IdQuery = true;
473         LLVM_FALLTHROUGH;
474       case Intrinsic::r600_read_local_size_x:
475         Dim = 0;
476         break;
477       case Intrinsic::amdgcn_workitem_id_y:
478       case Intrinsic::r600_read_tidig_y:
479         IdQuery = true;
480         LLVM_FALLTHROUGH;
481       case Intrinsic::r600_read_local_size_y:
482         Dim = 1;
483         break;
484       case Intrinsic::amdgcn_workitem_id_z:
485       case Intrinsic::r600_read_tidig_z:
486         IdQuery = true;
487         LLVM_FALLTHROUGH;
488       case Intrinsic::r600_read_local_size_z:
489         Dim = 2;
490         break;
491       default:
492         break;
493       }
494 
495       if (Dim <= 3) {
496         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
497         if (ReqdSize != std::numeric_limits<unsigned>::max())
498           MinSize = MaxSize = ReqdSize;
499       }
500     }
501   }
502 
503   if (!MaxSize)
504     return false;
505 
506   // Range metadata is [Lo, Hi). For ID query we need to pass max size
507   // as Hi. For size query we need to pass Hi + 1.
508   if (IdQuery)
509     MinSize = 0;
510   else
511     ++MaxSize;
512 
513   MDBuilder MDB(I->getContext());
514   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
515                                                   APInt(32, MaxSize));
516   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
517   return true;
518 }
519 
520 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
521                                                  Align &MaxAlign) const {
522   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
523          F.getCallingConv() == CallingConv::SPIR_KERNEL);
524 
525   const DataLayout &DL = F.getParent()->getDataLayout();
526   uint64_t ExplicitArgBytes = 0;
527   MaxAlign = Align(1);
528 
529   for (const Argument &Arg : F.args()) {
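    // For byref arguments the in-memory type and alignment come from the
    // byref attribute rather than from the pointer argument itself.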
530     const bool IsByRef = Arg.hasByRefAttr();
531     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
532     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
533     if (!Alignment)
534       Alignment = DL.getABITypeAlign(ArgTy);
535 
536     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
537     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
538     MaxAlign = max(MaxAlign, Alignment);
539   }
540 
541   return ExplicitArgBytes;
542 }
543 
544 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
545                                                 Align &MaxAlign) const {
546   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
547 
548   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
549 
550   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
551   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
552   if (ImplicitBytes != 0) {
553     const Align Alignment = getAlignmentForImplicitArgPtr();
554     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
555   }
556 
557   // Being able to dereference past the end is useful for emitting scalar loads.
558   return alignTo(TotalSize, 4);
559 }
560 
561 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
562                              const TargetMachine &TM) :
563   R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
564   AMDGPUSubtarget(TT),
565   InstrInfo(*this),
566   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
567   FMA(false),
568   CaymanISA(false),
569   CFALUBug(false),
570   HasVertexCache(false),
571   R600ALUInst(false),
572   FP64(false),
573   TexVTXClauseSize(0),
574   Gen(R600),
575   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
576   InstrItins(getInstrItineraryForCPU(GPU)) { }
577 
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
583   Policy.ShouldTrackPressure = true;
584 
  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
587   Policy.OnlyTopDown = false;
588   Policy.OnlyBottomUp = false;
589 
590   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
591   if (!enableSIScheduler())
592     Policy.ShouldTrackLaneMasks = true;
593 }
594 
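// v_mad_f16 is only available if the pseudo maps to a real MC opcode on this
// subtarget.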
595 bool GCNSubtarget::hasMadF16() const {
596   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
597 }
598 
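// Use the VGPR indexing mode when movrel is unavailable, or when the mode is
// supported and was explicitly requested via -amdgpu-vgpr-index-mode.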
599 bool GCNSubtarget::useVGPRIndexMode() const {
600   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
601 }
602 
603 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
604   if (getGeneration() >= AMDGPUSubtarget::GFX10)
605     return getMaxWavesPerEU();
606 
607   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
608     if (SGPRs <= 80)
609       return 10;
610     if (SGPRs <= 88)
611       return 9;
612     if (SGPRs <= 100)
613       return 8;
614     return 7;
615   }
616   if (SGPRs <= 48)
617     return 10;
618   if (SGPRs <= 56)
619     return 9;
620   if (SGPRs <= 64)
621     return 8;
622   if (SGPRs <= 72)
623     return 7;
624   if (SGPRs <= 80)
625     return 6;
626   return 5;
627 }
628 
629 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
630   unsigned MaxWaves = getMaxWavesPerEU();
631   unsigned Granule = getVGPRAllocGranule();
632   if (VGPRs < Granule)
633     return MaxWaves;
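  // Round register usage up to the allocation granule and see how many such
  // allocations fit in the register file.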
634   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
635   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
636 }
637 
638 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
639   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
640   if (getGeneration() >= AMDGPUSubtarget::GFX10)
641     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
642 
643   if (MFI.hasFlatScratchInit()) {
644     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
645       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
646     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
647       return 4; // FLAT_SCRATCH, VCC (in that order).
648   }
649 
650   if (isXNACKEnabled())
651     return 4; // XNACK, VCC (in that order).
652   return 2; // VCC.
653 }
654 
655 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
656                                         unsigned NumSGPRs,
657                                         unsigned NumVGPRs) const {
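  // Start from the LDS-limited occupancy and further clamp it by the SGPR and
  // VGPR budgets when those are provided.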
658   unsigned Occupancy =
659     std::min(getMaxWavesPerEU(),
660              getOccupancyWithLocalMemSize(LDSSize, F));
661   if (NumSGPRs)
662     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
663   if (NumVGPRs)
664     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
665   return Occupancy;
666 }
667 
668 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
669   const Function &F = MF.getFunction();
670   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
671 
672   // Compute maximum number of SGPRs function can use using default/requested
673   // minimum number of waves per execution unit.
674   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
675   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
676   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
677 
678   // Check if maximum number of SGPRs was explicitly requested using
679   // "amdgpu-num-sgpr" attribute.
680   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
681     unsigned Requested = AMDGPU::getIntegerAttribute(
682       F, "amdgpu-num-sgpr", MaxNumSGPRs);
683 
684     // Make sure requested value does not violate subtarget's specifications.
685     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
686       Requested = 0;
687 
688     // If more SGPRs are required to support the input user/system SGPRs,
689     // increase to accommodate them.
690     //
691     // FIXME: This really ends up using the requested number of SGPRs + number
692     // of reserved special registers in total. Theoretically you could re-use
693     // the last input registers for these special registers, but this would
694     // require a lot of complexity to deal with the weird aliasing.
695     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
696     if (Requested && Requested < InputNumSGPRs)
697       Requested = InputNumSGPRs;
698 
699     // Make sure requested value is compatible with values implied by
700     // default/requested minimum/maximum number of waves per execution unit.
701     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
702       Requested = 0;
703     if (WavesPerEU.second &&
704         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
705       Requested = 0;
706 
707     if (Requested)
708       MaxNumSGPRs = Requested;
709   }
710 
711   if (hasSGPRInitBug())
712     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
713 
714   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
715                   MaxAddressableNumSGPRs);
716 }
717 
718 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
719   const Function &F = MF.getFunction();
720   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
721 
722   // Compute maximum number of VGPRs function can use using default/requested
723   // minimum number of waves per execution unit.
724   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
725   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
726 
727   // Check if maximum number of VGPRs was explicitly requested using
728   // "amdgpu-num-vgpr" attribute.
729   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
730     unsigned Requested = AMDGPU::getIntegerAttribute(
731       F, "amdgpu-num-vgpr", MaxNumVGPRs);
732 
733     // Make sure requested value is compatible with values implied by
734     // default/requested minimum/maximum number of waves per execution unit.
735     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
736       Requested = 0;
737     if (WavesPerEU.second &&
738         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
739       Requested = 0;
740 
741     if (Requested)
742       MaxNumVGPRs = Requested;
743   }
744 
745   return MaxNumVGPRs;
746 }
747 
748 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
749                                          int UseOpIdx, SDep &Dep) const {
750   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
751       !Def->isInstr() || !Use->isInstr())
752     return;
753 
754   MachineInstr *DefI = Def->getInstr();
755   MachineInstr *UseI = Use->getInstr();
756 
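  // For a dependency into or out of a bundle, refine the latency by looking
  // at the bundled instruction that actually defines or reads the register.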
757   if (DefI->isBundle()) {
758     const SIRegisterInfo *TRI = getRegisterInfo();
759     auto Reg = Dep.getReg();
760     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
761     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
762     unsigned Lat = 0;
763     for (++I; I != E && I->isBundledWithPred(); ++I) {
764       if (I->modifiesRegister(Reg, TRI))
765         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
766       else if (Lat)
767         --Lat;
768     }
769     Dep.setLatency(Lat);
770   } else if (UseI->isBundle()) {
771     const SIRegisterInfo *TRI = getRegisterInfo();
772     auto Reg = Dep.getReg();
773     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
774     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
775     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
776     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
777       if (I->readsRegister(Reg, TRI))
778         break;
779       --Lat;
780     }
781     Dep.setLatency(Lat);
782   }
783 }
784 
785 namespace {
786 struct FillMFMAShadowMutation : ScheduleDAGMutation {
787   const SIInstrInfo *TII;
788 
789   ScheduleDAGMI *DAG;
790 
791   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
792 
793   bool isSALU(const SUnit *SU) const {
794     const MachineInstr *MI = SU->getInstr();
795     return MI && TII->isSALU(*MI) && !MI->isTerminator();
796   }
797 
798   bool isVALU(const SUnit *SU) const {
799     const MachineInstr *MI = SU->getInstr();
800     return MI && TII->isVALU(*MI);
801   }
802 
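  // Check that adding an artificial edge from Pred to Succ cannot create a
  // cycle, i.e. that Pred is not already reachable from Succ.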
803   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
804     if (Pred->NodeNum < Succ->NodeNum)
805       return true;
806 
807     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
808 
809     for (unsigned I = 0; I < Succs.size(); ++I) {
810       for (const SDep &SI : Succs[I]->Succs) {
811         const SUnit *SU = SI.getSUnit();
812         if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
813           Succs.push_back(SU);
814       }
815     }
816 
817     SmallPtrSet<const SUnit*, 32> Visited;
818     while (!Preds.empty()) {
819       const SUnit *SU = Preds.pop_back_val();
820       if (llvm::find(Succs, SU) != Succs.end())
821         return false;
822       Visited.insert(SU);
823       for (const SDep &SI : SU->Preds)
824         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
825           Preds.push_back(SI.getSUnit());
826     }
827 
828     return true;
829   }
830 
  // Link as many SALU instructions in a chain as possible. Return the size of
  // the chain. Links up to MaxChain instructions.
833   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
834                          SmallPtrSetImpl<SUnit *> &Visited) const {
835     SmallVector<SUnit *, 8> Worklist({To});
836     unsigned Linked = 0;
837 
838     while (!Worklist.empty() && MaxChain-- > 0) {
839       SUnit *SU = Worklist.pop_back_val();
840       if (!Visited.insert(SU).second)
841         continue;
842 
      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
845 
846       if (SU->addPred(SDep(From, SDep::Artificial), false))
847         ++Linked;
848 
849       for (SDep &SI : From->Succs) {
850         SUnit *SUv = SI.getSUnit();
851         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
852           SUv->addPred(SDep(SU, SDep::Artificial), false);
853       }
854 
855       for (SDep &SI : SU->Succs) {
856         SUnit *Succ = SI.getSUnit();
857         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
858           Worklist.push_back(Succ);
859       }
860     }
861 
862     return Linked;
863   }
864 
865   void apply(ScheduleDAGInstrs *DAGInstrs) override {
866     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
867     if (!ST.hasMAIInsts() || DisablePowerSched)
868       return;
869     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
870     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
871     if (!TSchedModel || DAG->SUnits.empty())
872       return;
873 
    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
878     auto LastSALU = DAG->SUnits.begin();
879     auto E = DAG->SUnits.end();
880     SmallPtrSet<SUnit*, 32> Visited;
881     for (SUnit &SU : DAG->SUnits) {
882       MachineInstr &MAI = *SU.getInstr();
883       if (!TII->isMAI(MAI) ||
884            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
885            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
886         continue;
887 
888       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
889 
890       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
891                  dbgs() << "Need " << Lat
892                         << " instructions to cover latency.\n");
893 
894       // Find up to Lat independent scalar instructions as early as
895       // possible such that they can be scheduled after this MFMA.
896       for ( ; Lat && LastSALU != E; ++LastSALU) {
897         if (Visited.count(&*LastSALU))
898           continue;
899 
900         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
901           continue;
902 
903         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
904       }
905     }
906   }
907 };
908 } // namespace
909 
910 void GCNSubtarget::getPostRAMutations(
911     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
912   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
913 }
914 
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}
921 
const AMDGPUSubtarget &
AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}
928