1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
42 
43 static cl::opt<bool> DisablePowerSched(
44   "amdgpu-disable-power-sched",
45   cl::desc("Disable scheduling to minimize mAI power bursts"),
46   cl::init(false));
47 
48 static cl::opt<bool> EnableVGPRIndexMode(
49   "amdgpu-vgpr-index-mode",
50   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51   cl::init(false));
52 
53 GCNSubtarget::~GCNSubtarget() = default;
54 
55 R600Subtarget &
56 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
57                                                StringRef GPU, StringRef FS) {
58   SmallString<256> FullFS("+promote-alloca,");
59   FullFS += FS;
60   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
61 
62   HasMulU24 = getGeneration() >= EVERGREEN;
63   HasMulI24 = hasCaymanISA();
64 
65   return *this;
66 }
67 
68 GCNSubtarget &
69 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
70                                               StringRef GPU, StringRef FS) {
71   // Determine default and user-specified characteristics
72   //
73   // We want to be able to turn these off, but making this a subtarget feature
74   // for SI has the unhelpful behavior that it unsets everything else if you
75   // disable it.
76   //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
79 
80   // Assuming ECC is enabled is the conservative default.
81   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
82 
83   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
84     FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
85 
86   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
87 
88   // Disable mutually exclusive bits.
89   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
90     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
91       FullFS += "-wavefrontsize16,";
92     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
93       FullFS += "-wavefrontsize32,";
94     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
95       FullFS += "-wavefrontsize64,";
96   }
97 
98   FullFS += FS;
99 
100   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
101 
  // We don't support FP64 for EG/NI at the moment.
103   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
104 
  // Unless flat-for-global is explicitly enabled or disabled in FS, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
108   if (!hasAddr64() && !FS.contains("flat-for-global")) {
109     FlatForGlobal = true;
110   }
111 
112   // Set defaults if needed.
113   if (MaxPrivateElementSize == 0)
114     MaxPrivateElementSize = 4;
115 
116   if (LDSBankCount == 0)
117     LDSBankCount = 32;
118 
119   if (TT.getArch() == Triple::amdgcn) {
120     if (LocalMemorySize == 0)
121       LocalMemorySize = 32768;
122 
123     // Do something sensible for unspecified target.
124     if (!HasMovrel && !HasVGPRIndexMode)
125       HasMovrel = true;
126   }
127 
128   // Don't crash on invalid devices.
129   if (WavefrontSizeLog2 == 0)
130     WavefrontSizeLog2 = 5;
131 
132   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
133 
  // Disable XNACK on targets that do not support it unless it was explicitly
  // requested.
136   if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
137     ToggleFeature(AMDGPU::FeatureXNACK);
138     EnableXNACK = false;
139   }
140 
  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
144   if (DoesNotSupportSRAMECC && EnableSRAMECC) {
145     ToggleFeature(AMDGPU::FeatureSRAMECC);
146     EnableSRAMECC = false;
147   }
148 
149   return *this;
150 }
151 
152 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
153   TargetTriple(TT),
154   Has16BitInsts(false),
155   HasMadMixInsts(false),
156   HasMadMacF32Insts(false),
157   HasDsSrc2Insts(false),
158   HasSDWA(false),
159   HasVOP3PInsts(false),
160   HasMulI24(true),
161   HasMulU24(true),
162   HasInv2PiInlineImm(false),
163   HasFminFmaxLegacy(true),
164   EnablePromoteAlloca(false),
165   HasTrigReducedRange(false),
166   MaxWavesPerEU(10),
167   LocalMemorySize(0),
168   WavefrontSizeLog2(0)
169   { }
170 
171 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
172                            const GCNTargetMachine &TM) :
173     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
174     AMDGPUSubtarget(TT),
175     TargetTriple(TT),
176     Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
177     InstrItins(getInstrItineraryForCPU(GPU)),
178     LDSBankCount(0),
179     MaxPrivateElementSize(0),
180 
181     FastFMAF32(false),
182     FastDenormalF32(false),
183     HalfRate64Ops(false),
184 
185     FlatForGlobal(false),
186     AutoWaitcntBeforeBarrier(false),
187     CodeObjectV3(false),
188     UnalignedScratchAccess(false),
189     UnalignedAccessMode(false),
190 
191     HasApertureRegs(false),
192     EnableXNACK(false),
193     DoesNotSupportXNACK(false),
194     EnableCuMode(false),
195     TrapHandler(false),
196 
197     EnableLoadStoreOpt(false),
198     EnableUnsafeDSOffsetFolding(false),
199     EnableSIScheduler(false),
200     EnableDS128(false),
201     EnablePRTStrictNull(false),
202     DumpCode(false),
203 
204     FP64(false),
205     GCN3Encoding(false),
206     CIInsts(false),
207     GFX8Insts(false),
208     GFX9Insts(false),
209     GFX10Insts(false),
210     GFX10_3Insts(false),
211     GFX7GFX8GFX9Insts(false),
212     SGPRInitBug(false),
213     HasSMemRealTime(false),
214     HasIntClamp(false),
215     HasFmaMixInsts(false),
216     HasMovrel(false),
217     HasVGPRIndexMode(false),
218     HasScalarStores(false),
219     HasScalarAtomics(false),
220     HasSDWAOmod(false),
221     HasSDWAScalar(false),
222     HasSDWASdst(false),
223     HasSDWAMac(false),
224     HasSDWAOutModsVOPC(false),
225     HasDPP(false),
226     HasDPP8(false),
227     HasR128A16(false),
228     HasGFX10A16(false),
229     HasG16(false),
230     HasNSAEncoding(false),
231     GFX10_BEncoding(false),
232     HasDLInsts(false),
233     HasDot1Insts(false),
234     HasDot2Insts(false),
235     HasDot3Insts(false),
236     HasDot4Insts(false),
237     HasDot5Insts(false),
238     HasDot6Insts(false),
239     HasMAIInsts(false),
240     HasPkFmacF16Inst(false),
241     HasAtomicFaddInsts(false),
242     EnableSRAMECC(false),
243     DoesNotSupportSRAMECC(false),
244     HasNoSdstCMPX(false),
245     HasVscnt(false),
246     HasGetWaveIdInst(false),
247     HasSMemTimeInst(false),
248     HasRegisterBanking(false),
249     HasVOP3Literal(false),
250     HasNoDataDepHazard(false),
251     FlatAddressSpace(false),
252     FlatInstOffsets(false),
253     FlatGlobalInsts(false),
254     FlatScratchInsts(false),
255     ScalarFlatScratchInsts(false),
256     AddNoCarryInsts(false),
257     HasUnpackedD16VMem(false),
258     LDSMisalignedBug(false),
259     HasMFMAInlineLiteralBug(false),
260     UnalignedBufferAccess(false),
261     UnalignedDSAccess(false),
262 
263     ScalarizeGlobal(false),
264 
265     HasVcmpxPermlaneHazard(false),
266     HasVMEMtoScalarWriteHazard(false),
267     HasSMEMtoVectorWriteHazard(false),
268     HasInstFwdPrefetchBug(false),
269     HasVcmpxExecWARHazard(false),
270     HasLdsBranchVmemWARHazard(false),
271     HasNSAtoVMEMBug(false),
272     HasOffset3fBug(false),
273     HasFlatSegmentOffsetBug(false),
274 
275     FeatureDisable(false),
276     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
277     TLInfo(TM, *this),
278     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
279   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
280   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
281   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
282   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
283   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
286 }
287 
288 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
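  // Pre-GFX10 targets allow a single constant bus (SGPR or literal) operand
  // per VALU instruction. GFX10 raises the limit to two, except for the 64-bit
  // shifts handled below, which are still limited to one.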
289   if (getGeneration() < GFX10)
290     return 1;
291 
292   switch (Opcode) {
293   case AMDGPU::V_LSHLREV_B64:
294   case AMDGPU::V_LSHLREV_B64_gfx10:
295   case AMDGPU::V_LSHL_B64:
296   case AMDGPU::V_LSHRREV_B64:
297   case AMDGPU::V_LSHRREV_B64_gfx10:
298   case AMDGPU::V_LSHR_B64:
299   case AMDGPU::V_ASHRREV_I64:
300   case AMDGPU::V_ASHRREV_I64_gfx10:
301   case AMDGPU::V_ASHR_I64:
302     return 1;
303   }
304 
305   return 2;
306 }
307 
308 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
309   const Function &F) const {
310   if (NWaves == 1)
311     return getLocalMemorySize();
312   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
313   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
314   if (!WorkGroupsPerCu)
315     return 0;
316   unsigned MaxWaves = getMaxWavesPerEU();
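  // Scale the per-CU LDS size by the ratio of the maximum waves per EU to the
  // number of work groups per CU, then split the budget across the requested
  // number of waves.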
317   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
318 }
319 
320 // FIXME: Should return min,max range.
321 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
322   const Function &F) const {
323   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
324   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
325   if (!MaxWorkGroupsPerCu)
326     return 0;
327 
328   const unsigned WaveSize = getWavefrontSize();
329 
  // FIXME: Do we need to account for the LDS alignment requirement by rounding
  // the size up?
  // Compute the occupancy restriction based on LDS usage.
333   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
334 
335   // This can be queried with more LDS than is possible, so just assume the
336   // worst.
337   if (NumGroups == 0)
338     return 1;
339 
340   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
341 
342   // Round to the number of waves.
343   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
344   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
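  // Illustrative example (not tied to a specific target): with 64 KiB of LDS,
  // 16 KiB used per work group, a work group size of 256 and a wave size of
  // 64, NumGroups is 4 and each group needs 4 waves, giving 16 waves before
  // clamping below.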
345 
346   // Clamp to the maximum possible number of waves.
347   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
348 
349   // FIXME: Needs to be a multiple of the group size?
350   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
351 
352   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
353          "computed invalid occupancy");
354   return MaxWaves;
355 }
356 
357 unsigned
358 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
359   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
360   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
361 }
362 
363 std::pair<unsigned, unsigned>
364 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
365   switch (CC) {
366   case CallingConv::AMDGPU_VS:
367   case CallingConv::AMDGPU_LS:
368   case CallingConv::AMDGPU_HS:
369   case CallingConv::AMDGPU_ES:
370   case CallingConv::AMDGPU_GS:
371   case CallingConv::AMDGPU_PS:
372     return std::make_pair(1, getWavefrontSize());
373   default:
374     return std::make_pair(1u, getMaxFlatWorkGroupSize());
375   }
376 }
377 
378 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
379   const Function &F) const {
380   // Default minimum/maximum flat work group sizes.
381   std::pair<unsigned, unsigned> Default =
382     getDefaultFlatWorkGroupSize(F.getCallingConv());
383 
384   // Requested minimum/maximum flat work group sizes.
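  // The attribute value is a comma-separated "min,max" pair, e.g.
  // "amdgpu-flat-work-group-size"="128,256".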
385   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
386     F, "amdgpu-flat-work-group-size", Default);
387 
  // Make sure the requested minimum is not larger than the requested maximum.
389   if (Requested.first > Requested.second)
390     return Default;
391 
392   // Make sure requested values do not violate subtarget's specifications.
393   if (Requested.first < getMinFlatWorkGroupSize())
394     return Default;
395   if (Requested.second > getMaxFlatWorkGroupSize())
396     return Default;
397 
398   return Requested;
399 }
400 
401 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
402   const Function &F) const {
403   // Default minimum/maximum number of waves per execution unit.
404   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
405 
406   // Default/requested minimum/maximum flat work group sizes.
407   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
408 
  // The default minimum number of waves per execution unit is the value
  // implied by the maximum flat work group size, whether that size was
  // defaulted or explicitly requested via the "amdgpu-flat-work-group-size"
  // attribute.
413   unsigned MinImpliedByFlatWorkGroupSize =
414     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
415   Default.first = MinImpliedByFlatWorkGroupSize;
416   bool RequestedFlatWorkGroupSize =
417       F.hasFnAttribute("amdgpu-flat-work-group-size");
418 
419   // Requested minimum/maximum number of waves per execution unit.
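  // The attribute value is a "min" or "min,max" pair; the trailing 'true'
  // makes the maximum optional, in which case the default maximum is kept.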
420   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
421     F, "amdgpu-waves-per-eu", Default, true);
422 
  // Make sure the requested minimum is not larger than the requested maximum.
424   if (Requested.second && Requested.first > Requested.second)
425     return Default;
426 
427   // Make sure requested values do not violate subtarget's specifications.
428   if (Requested.first < getMinWavesPerEU() ||
429       Requested.second > getMaxWavesPerEU())
430     return Default;
431 
432   // Make sure requested values are compatible with values implied by requested
433   // minimum/maximum flat work group sizes.
434   if (RequestedFlatWorkGroupSize &&
435       Requested.first < MinImpliedByFlatWorkGroupSize)
436     return Default;
437 
438   return Requested;
439 }
440 
441 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
442   auto Node = Kernel.getMetadata("reqd_work_group_size");
443   if (Node && Node->getNumOperands() == 3)
444     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
445   return std::numeric_limits<unsigned>::max();
446 }
447 
448 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
449                                            unsigned Dimension) const {
450   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
451   if (ReqdSize != std::numeric_limits<unsigned>::max())
452     return ReqdSize - 1;
453   return getFlatWorkGroupSizes(Kernel).second - 1;
454 }
455 
456 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
457   Function *Kernel = I->getParent()->getParent();
458   unsigned MinSize = 0;
459   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
460   bool IdQuery = false;
461 
  // If reqd_work_group_size is present it narrows the value down.
463   if (auto *CI = dyn_cast<CallInst>(I)) {
464     const Function *F = CI->getCalledFunction();
465     if (F) {
466       unsigned Dim = UINT_MAX;
467       switch (F->getIntrinsicID()) {
468       case Intrinsic::amdgcn_workitem_id_x:
469       case Intrinsic::r600_read_tidig_x:
470         IdQuery = true;
471         LLVM_FALLTHROUGH;
472       case Intrinsic::r600_read_local_size_x:
473         Dim = 0;
474         break;
475       case Intrinsic::amdgcn_workitem_id_y:
476       case Intrinsic::r600_read_tidig_y:
477         IdQuery = true;
478         LLVM_FALLTHROUGH;
479       case Intrinsic::r600_read_local_size_y:
480         Dim = 1;
481         break;
482       case Intrinsic::amdgcn_workitem_id_z:
483       case Intrinsic::r600_read_tidig_z:
484         IdQuery = true;
485         LLVM_FALLTHROUGH;
486       case Intrinsic::r600_read_local_size_z:
487         Dim = 2;
488         break;
489       default:
490         break;
491       }
492 
493       if (Dim <= 3) {
494         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
495         if (ReqdSize != std::numeric_limits<unsigned>::max())
496           MinSize = MaxSize = ReqdSize;
497       }
498     }
499   }
500 
501   if (!MaxSize)
502     return false;
503 
504   // Range metadata is [Lo, Hi). For ID query we need to pass max size
505   // as Hi. For size query we need to pass Hi + 1.
506   if (IdQuery)
507     MinSize = 0;
508   else
509     ++MaxSize;
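  // For example, a workitem id query in dimension 0 with
  // reqd_work_group_size = <64, 1, 1> gets the range [0, 64).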
510 
511   MDBuilder MDB(I->getContext());
512   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
513                                                   APInt(32, MaxSize));
514   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
515   return true;
516 }
517 
518 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
519                                                  Align &MaxAlign) const {
520   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
521          F.getCallingConv() == CallingConv::SPIR_KERNEL);
522 
523   const DataLayout &DL = F.getParent()->getDataLayout();
524   uint64_t ExplicitArgBytes = 0;
525   MaxAlign = Align(1);
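  // Each argument is placed at the next offset satisfying its alignment (the
  // byref alignment if given, otherwise the ABI type alignment). For example,
  // an i32 followed by a 16-byte-aligned, 16-byte byref argument occupies
  // bytes [0, 4) and [16, 32), giving 32 explicit bytes and MaxAlign = 16.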
526 
527   for (const Argument &Arg : F.args()) {
528     const bool IsByRef = Arg.hasByRefAttr();
529     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
530     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
531     if (!Alignment)
532       Alignment = DL.getABITypeAlign(ArgTy);
533 
534     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
535     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
536     MaxAlign = max(MaxAlign, Alignment);
537   }
538 
539   return ExplicitArgBytes;
540 }
541 
542 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
543                                                 Align &MaxAlign) const {
544   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
545 
546   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
547 
548   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
549   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
550   if (ImplicitBytes != 0) {
551     const Align Alignment = getAlignmentForImplicitArgPtr();
552     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
553   }
554 
555   // Being able to dereference past the end is useful for emitting scalar loads.
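  // For example, with an explicit offset of 0, 6 explicit argument bytes and
  // no implicit arguments, the returned segment size is 8.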
556   return alignTo(TotalSize, 4);
557 }
558 
559 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
560                              const TargetMachine &TM) :
561   R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
562   AMDGPUSubtarget(TT),
563   InstrInfo(*this),
564   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
565   FMA(false),
566   CaymanISA(false),
567   CFALUBug(false),
568   HasVertexCache(false),
569   R600ALUInst(false),
570   FP64(false),
571   TexVTXClauseSize(0),
572   Gen(R600),
573   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
574   InstrItins(getInstrItineraryForCPU(GPU)) { }
575 
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
578   // Track register pressure so the scheduler can try to decrease
579   // pressure once register usage is above the threshold defined by
580   // SIRegisterInfo::getRegPressureSetLimit()
581   Policy.ShouldTrackPressure = true;
582 
  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using either approach on its own.
585   Policy.OnlyTopDown = false;
586   Policy.OnlyBottomUp = false;
587 
588   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
589   if (!enableSIScheduler())
590     Policy.ShouldTrackLaneMasks = true;
591 }
592 
593 bool GCNSubtarget::hasMadF16() const {
594   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
595 }
596 
597 bool GCNSubtarget::useVGPRIndexMode() const {
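  // Use GPR indexing only when movrel is unavailable, or when it was
  // explicitly requested on the command line and the subtarget supports it.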
598   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
599 }
600 
601 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
602   if (getGeneration() >= AMDGPUSubtarget::GFX10)
603     return getMaxWavesPerEU();
604 
605   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
606     if (SGPRs <= 80)
607       return 10;
608     if (SGPRs <= 88)
609       return 9;
610     if (SGPRs <= 100)
611       return 8;
612     return 7;
613   }
614   if (SGPRs <= 48)
615     return 10;
616   if (SGPRs <= 56)
617     return 9;
618   if (SGPRs <= 64)
619     return 8;
620   if (SGPRs <= 72)
621     return 7;
622   if (SGPRs <= 80)
623     return 6;
624   return 5;
625 }
626 
627 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
628   unsigned MaxWaves = getMaxWavesPerEU();
629   unsigned Granule = getVGPRAllocGranule();
630   if (VGPRs < Granule)
631     return MaxWaves;
632   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
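  // For example, assuming an allocation granule of 4 and 256 addressable
  // VGPRs, a request for 70 registers rounds up to 72 and supports
  // 256 / 72 = 3 waves before clamping to MaxWaves.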
633   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
634 }
635 
636 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
637   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
638   if (getGeneration() >= AMDGPUSubtarget::GFX10)
639     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
640 
641   if (MFI.hasFlatScratchInit()) {
642     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
643       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
644     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
645       return 4; // FLAT_SCRATCH, VCC (in that order).
646   }
647 
648   if (isXNACKEnabled())
649     return 4; // XNACK, VCC (in that order).
650   return 2; // VCC.
651 }
652 
653 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
654                                         unsigned NumSGPRs,
655                                         unsigned NumVGPRs) const {
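  // The achievable occupancy is the tightest of the LDS-, SGPR- and
  // VGPR-imposed limits; a register count of zero means that resource does not
  // constrain the result.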
656   unsigned Occupancy =
657     std::min(getMaxWavesPerEU(),
658              getOccupancyWithLocalMemSize(LDSSize, F));
659   if (NumSGPRs)
660     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
661   if (NumVGPRs)
662     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
663   return Occupancy;
664 }
665 
666 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
667   const Function &F = MF.getFunction();
668   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
669 
  // Compute the maximum number of SGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
672   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
673   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
674   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
675 
676   // Check if maximum number of SGPRs was explicitly requested using
677   // "amdgpu-num-sgpr" attribute.
678   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
679     unsigned Requested = AMDGPU::getIntegerAttribute(
680       F, "amdgpu-num-sgpr", MaxNumSGPRs);
681 
682     // Make sure requested value does not violate subtarget's specifications.
683     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
684       Requested = 0;
685 
686     // If more SGPRs are required to support the input user/system SGPRs,
687     // increase to accommodate them.
688     //
689     // FIXME: This really ends up using the requested number of SGPRs + number
690     // of reserved special registers in total. Theoretically you could re-use
691     // the last input registers for these special registers, but this would
692     // require a lot of complexity to deal with the weird aliasing.
693     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
694     if (Requested && Requested < InputNumSGPRs)
695       Requested = InputNumSGPRs;
696 
697     // Make sure requested value is compatible with values implied by
698     // default/requested minimum/maximum number of waves per execution unit.
699     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
700       Requested = 0;
701     if (WavesPerEU.second &&
702         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
703       Requested = 0;
704 
705     if (Requested)
706       MaxNumSGPRs = Requested;
707   }
708 
709   if (hasSGPRInitBug())
710     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
711 
712   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
713                   MaxAddressableNumSGPRs);
714 }
715 
716 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
717   const Function &F = MF.getFunction();
718   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
719 
  // Compute the maximum number of VGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
722   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
723   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
724 
725   // Check if maximum number of VGPRs was explicitly requested using
726   // "amdgpu-num-vgpr" attribute.
727   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
728     unsigned Requested = AMDGPU::getIntegerAttribute(
729       F, "amdgpu-num-vgpr", MaxNumVGPRs);
730 
731     // Make sure requested value is compatible with values implied by
732     // default/requested minimum/maximum number of waves per execution unit.
733     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
734       Requested = 0;
735     if (WavesPerEU.second &&
736         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
737       Requested = 0;
738 
739     if (Requested)
740       MaxNumVGPRs = Requested;
741   }
742 
743   return MaxNumVGPRs;
744 }
745 
746 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
747                                          int UseOpIdx, SDep &Dep) const {
748   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
749       !Def->isInstr() || !Use->isInstr())
750     return;
751 
752   MachineInstr *DefI = Def->getInstr();
753   MachineInstr *UseI = Use->getInstr();
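  // The scheduling DAG models a bundle through its header, so refine the
  // data-dependence latency to account for where inside the bundle the
  // register is actually defined or first read.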
754 
755   if (DefI->isBundle()) {
756     const SIRegisterInfo *TRI = getRegisterInfo();
757     auto Reg = Dep.getReg();
758     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
759     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
760     unsigned Lat = 0;
761     for (++I; I != E && I->isBundledWithPred(); ++I) {
762       if (I->modifiesRegister(Reg, TRI))
763         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
764       else if (Lat)
765         --Lat;
766     }
767     Dep.setLatency(Lat);
768   } else if (UseI->isBundle()) {
769     const SIRegisterInfo *TRI = getRegisterInfo();
770     auto Reg = Dep.getReg();
771     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
772     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
773     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
774     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
775       if (I->readsRegister(Reg, TRI))
776         break;
777       --Lat;
778     }
779     Dep.setLatency(Lat);
780   }
781 }
782 
783 namespace {
784 struct FillMFMAShadowMutation : ScheduleDAGMutation {
785   const SIInstrInfo *TII;
786 
787   ScheduleDAGMI *DAG;
788 
789   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
790 
791   bool isSALU(const SUnit *SU) const {
792     const MachineInstr *MI = SU->getInstr();
793     return MI && TII->isSALU(*MI) && !MI->isTerminator();
794   }
795 
796   bool isVALU(const SUnit *SU) const {
797     const MachineInstr *MI = SU->getInstr();
798     return MI && TII->isVALU(*MI);
799   }
800 
801   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
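    // Adding an artificial edge from Pred to Succ must not create a cycle.
    // Node numbers follow the original instruction order, so a lower-numbered
    // Pred is trivially safe; otherwise check that no transitive predecessor
    // of Pred is reachable from Succ.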
802     if (Pred->NodeNum < Succ->NodeNum)
803       return true;
804 
805     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
806 
807     for (unsigned I = 0; I < Succs.size(); ++I) {
808       for (const SDep &SI : Succs[I]->Succs) {
809         const SUnit *SU = SI.getSUnit();
810         if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
811           Succs.push_back(SU);
812       }
813     }
814 
815     SmallPtrSet<const SUnit*, 32> Visited;
816     while (!Preds.empty()) {
817       const SUnit *SU = Preds.pop_back_val();
818       if (llvm::find(Succs, SU) != Succs.end())
819         return false;
820       Visited.insert(SU);
821       for (const SDep &SI : SU->Preds)
822         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
823           Preds.push_back(SI.getSUnit());
824     }
825 
826     return true;
827   }
828 
  // Link as many SALU instructions in the chain as possible. Returns the
  // number of instructions linked. Links up to MaxChain instructions.
831   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
832                          SmallPtrSetImpl<SUnit *> &Visited) const {
833     SmallVector<SUnit *, 8> Worklist({To});
834     unsigned Linked = 0;
835 
836     while (!Worklist.empty() && MaxChain-- > 0) {
837       SUnit *SU = Worklist.pop_back_val();
838       if (!Visited.insert(SU).second)
839         continue;
840 
841       LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
842                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
843 
844       if (SU->addPred(SDep(From, SDep::Artificial), false))
845         ++Linked;
846 
847       for (SDep &SI : From->Succs) {
848         SUnit *SUv = SI.getSUnit();
849         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
850           SUv->addPred(SDep(SU, SDep::Artificial), false);
851       }
852 
853       for (SDep &SI : SU->Succs) {
854         SUnit *Succ = SI.getSUnit();
855         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
856           Worklist.push_back(Succ);
857       }
858     }
859 
860     return Linked;
861   }
862 
863   void apply(ScheduleDAGInstrs *DAGInstrs) override {
864     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
865     if (!ST.hasMAIInsts() || DisablePowerSched)
866       return;
867     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
868     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
869     if (!TSchedModel || DAG->SUnits.empty())
870       return;
871 
    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that they can be scheduled into the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions helps
    // to avoid power consumption bursts and throttling.
876     auto LastSALU = DAG->SUnits.begin();
877     auto E = DAG->SUnits.end();
878     SmallPtrSet<SUnit*, 32> Visited;
879     for (SUnit &SU : DAG->SUnits) {
880       MachineInstr &MAI = *SU.getInstr();
881       if (!TII->isMAI(MAI) ||
882            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
883            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
884         continue;
885 
886       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
887 
888       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
889                  dbgs() << "Need " << Lat
890                         << " instructions to cover latency.\n");
891 
892       // Find up to Lat independent scalar instructions as early as
893       // possible such that they can be scheduled after this MFMA.
894       for ( ; Lat && LastSALU != E; ++LastSALU) {
895         if (Visited.count(&*LastSALU))
896           continue;
897 
898         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
899           continue;
900 
901         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
902       }
903     }
904   }
905 };
906 } // namespace
907 
908 void GCNSubtarget::getPostRAMutations(
909     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
910   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
911 }
912 
913 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
914   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
915     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
916   else
917     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
918 }
919 
920 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
921   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
922     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
923   else
924     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
925 }
926