//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is explicitly enabled or disabled in the feature
  // string, turn on FlatForGlobal for all OSes on VI and newer hardware to
  // avoid assertion failures due to missing ADDR64 variants of MUBUF
  // instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets that do not support it unless it is explicitly
  // requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

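  // Even on GFX10, where most instructions may use two constant bus operands,
  // 64-bit shifts are limited to one.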
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
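  // Heuristic: split the per-CU local memory between the work groups that can
  // share a CU, scaled by the ratio of the maximum wave count to the requested
  // wave count.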
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
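  // The attribute value is a "min,max" pair, e.g.
  // "amdgpu-flat-work-group-size"="128,256".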
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
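  // The attribute value is "min" or "min,max"; the maximum may be omitted,
  // e.g. "amdgpu-waves-per-eu"="2,4".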
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
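    // For byref arguments the data lives in the kernarg segment, so the slot's
    // size and alignment come from the pointee type.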
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

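// Use the VGPR indexing mode when movrel is not available, or when GPR
// indexing was explicitly requested and the hardware supports it.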
bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

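// Map an SGPR count to the number of waves per EU that fit within the
// generation's SGPR budget.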
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
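  // Round the request up to the allocation granule and see how many such
  // allocations fit into the register file, clamped to the wave limit.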
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

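  // If the definition is inside a bundle, the effective latency is that of the
  // bundled instruction that writes the register, discounted by the number of
  // bundled instructions issued after it.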
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
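    // If the use is inside a bundle, charge the def's latency reduced by the
    // number of bundled instructions issued before the first one that reads
    // the register.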
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

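  // Check that making Pred a predecessor of Succ with an artificial edge will
  // not create a cycle, i.e. that Succ cannot already reach Pred.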
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

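      // Make the MFMA's VALU successors depend on this SALU as well, so they
      // cannot be scheduled into the shadow ahead of it.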
      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU instructions rather than VALU
    // helps prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}