1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
42 
43 static cl::opt<bool> DisablePowerSched(
44   "amdgpu-disable-power-sched",
45   cl::desc("Disable scheduling to minimize mAI power bursts"),
46   cl::init(false));
47 
48 GCNSubtarget::~GCNSubtarget() = default;
49 
50 R600Subtarget &
51 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
52                                                StringRef GPU, StringRef FS) {
53   SmallString<256> FullFS("+promote-alloca,");
54   FullFS += FS;
55   ParseSubtargetFeatures(GPU, FullFS);
56 
  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
60   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
61     FP32Denormals = false;
62   }
63 
64   HasMulU24 = getGeneration() >= EVERGREEN;
65   HasMulI24 = hasCaymanISA();
66 
67   return *this;
68 }
69 
70 GCNSubtarget &
71 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
72                                               StringRef GPU, StringRef FS) {
73   // Determine default and user-specified characteristics
74   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
75   // enabled, but some instructions do not respect them and they run at the
76   // double precision rate, so don't enable by default.
77   //
78   // We want to be able to turn these off, but making this a subtarget feature
79   // for SI has the unhelpful behavior that it unsets everything else if you
80   // disable it.
81   //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
84 
85   // Assuming ECC is enabled is the conservative default.
86   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
87 
88   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
89     FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
90 
  // FP64 and FP16 denormals are enabled by default on SI and newer targets;
  // FP32 denormals are left off by default (see the note above).
94   if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
95     FullFS += "+fp64-fp16-denormals,";
96   } else {
97     FullFS += "-fp32-denormals,";
98   }
99 
100   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
101 
102   // Disable mutually exclusive bits.
103   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
104     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
105       FullFS += "-wavefrontsize16,";
106     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
107       FullFS += "-wavefrontsize32,";
108     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
109       FullFS += "-wavefrontsize64,";
110   }
111 
112   FullFS += FS;
113 
114   ParseSubtargetFeatures(GPU, FullFS);
115 
116   // We don't support FP64 for EG/NI atm.
117   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
118 
  // Unless flat-for-global is explicitly enabled or disabled, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
122   if (!hasAddr64() && !FS.contains("flat-for-global")) {
123     FlatForGlobal = true;
124   }
125 
126   // Set defaults if needed.
127   if (MaxPrivateElementSize == 0)
128     MaxPrivateElementSize = 4;
129 
130   if (LDSBankCount == 0)
131     LDSBankCount = 32;
132 
133   if (TT.getArch() == Triple::amdgcn) {
134     if (LocalMemorySize == 0)
135       LocalMemorySize = 32768;
136 
137     // Do something sensible for unspecified target.
138     if (!HasMovrel && !HasVGPRIndexMode)
139       HasMovrel = true;
140   }
141 
142   // Don't crash on invalid devices.
143   if (WavefrontSize == 0)
144     WavefrontSize = 64;
145 
146   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
147 
148   if (DoesNotSupportXNACK && EnableXNACK) {
149     ToggleFeature(AMDGPU::FeatureXNACK);
150     EnableXNACK = false;
151   }
152 
  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but do not
  // support ECC.
156   if (DoesNotSupportSRAMECC && EnableSRAMECC) {
157     ToggleFeature(AMDGPU::FeatureSRAMECC);
158     EnableSRAMECC = false;
159   }
160 
161   return *this;
162 }
163 
164 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
165   TargetTriple(TT),
166   Has16BitInsts(false),
167   HasMadMixInsts(false),
168   FP32Denormals(false),
169   FPExceptions(false),
170   HasSDWA(false),
171   HasVOP3PInsts(false),
172   HasMulI24(true),
173   HasMulU24(true),
174   HasInv2PiInlineImm(false),
175   HasFminFmaxLegacy(true),
176   EnablePromoteAlloca(false),
177   HasTrigReducedRange(false),
178   MaxWavesPerEU(10),
179   LocalMemorySize(0),
180   WavefrontSize(0)
181   { }
182 
183 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
184                            const GCNTargetMachine &TM) :
185     AMDGPUGenSubtargetInfo(TT, GPU, FS),
186     AMDGPUSubtarget(TT),
187     TargetTriple(TT),
188     Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
189     InstrItins(getInstrItineraryForCPU(GPU)),
190     LDSBankCount(0),
191     MaxPrivateElementSize(0),
192 
193     FastFMAF32(false),
194     HalfRate64Ops(false),
195 
196     FP64FP16Denormals(false),
197     FlatForGlobal(false),
198     AutoWaitcntBeforeBarrier(false),
199     CodeObjectV3(false),
200     UnalignedScratchAccess(false),
201     UnalignedBufferAccess(false),
202 
203     HasApertureRegs(false),
204     EnableXNACK(false),
205     DoesNotSupportXNACK(false),
206     EnableCuMode(false),
207     TrapHandler(false),
208 
209     EnableLoadStoreOpt(false),
210     EnableUnsafeDSOffsetFolding(false),
211     EnableSIScheduler(false),
212     EnableDS128(false),
213     EnablePRTStrictNull(false),
214     DumpCode(false),
215 
216     FP64(false),
217     GCN3Encoding(false),
218     CIInsts(false),
219     GFX8Insts(false),
220     GFX9Insts(false),
221     GFX10Insts(false),
222     GFX7GFX8GFX9Insts(false),
223     SGPRInitBug(false),
224     HasSMemRealTime(false),
225     HasIntClamp(false),
226     HasFmaMixInsts(false),
227     HasMovrel(false),
228     HasVGPRIndexMode(false),
229     HasScalarStores(false),
230     HasScalarAtomics(false),
231     HasSDWAOmod(false),
232     HasSDWAScalar(false),
233     HasSDWASdst(false),
234     HasSDWAMac(false),
235     HasSDWAOutModsVOPC(false),
236     HasDPP(false),
237     HasDPP8(false),
238     HasR128A16(false),
239     HasNSAEncoding(false),
240     HasDLInsts(false),
241     HasDot1Insts(false),
242     HasDot2Insts(false),
243     HasDot3Insts(false),
244     HasDot4Insts(false),
245     HasDot5Insts(false),
246     HasDot6Insts(false),
247     HasMAIInsts(false),
248     HasPkFmacF16Inst(false),
249     HasAtomicFaddInsts(false),
250     EnableSRAMECC(false),
251     DoesNotSupportSRAMECC(false),
252     HasNoSdstCMPX(false),
253     HasVscnt(false),
254     HasRegisterBanking(false),
255     HasVOP3Literal(false),
256     HasNoDataDepHazard(false),
257     FlatAddressSpace(false),
258     FlatInstOffsets(false),
259     FlatGlobalInsts(false),
260     FlatScratchInsts(false),
261     ScalarFlatScratchInsts(false),
262     AddNoCarryInsts(false),
263     HasUnpackedD16VMem(false),
264     LDSMisalignedBug(false),
265 
266     ScalarizeGlobal(false),
267 
268     HasVcmpxPermlaneHazard(false),
269     HasVMEMtoScalarWriteHazard(false),
270     HasSMEMtoVectorWriteHazard(false),
271     HasInstFwdPrefetchBug(false),
272     HasVcmpxExecWARHazard(false),
273     HasLdsBranchVmemWARHazard(false),
274     HasNSAtoVMEMBug(false),
275     HasOffset3fBug(false),
276     HasFlatSegmentOffsetBug(false),
277 
278     FeatureDisable(false),
279     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
280     TLInfo(TM, *this),
281     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
282   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
283   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
284   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
285   RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
288 }
289 
290 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
291   if (getGeneration() < GFX10)
292     return 1;
293 
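  // These 64-bit shifts can only read a single SGPR or literal constant, so
  // they are limited to one constant bus use even on GFX10.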
294   switch (Opcode) {
295   case AMDGPU::V_LSHLREV_B64:
296   case AMDGPU::V_LSHLREV_B64_gfx10:
297   case AMDGPU::V_LSHL_B64:
298   case AMDGPU::V_LSHRREV_B64:
299   case AMDGPU::V_LSHRREV_B64_gfx10:
300   case AMDGPU::V_LSHR_B64:
301   case AMDGPU::V_ASHRREV_I64:
302   case AMDGPU::V_ASHRREV_I64_gfx10:
303   case AMDGPU::V_ASHR_I64:
304     return 1;
305   }
306 
307   return 2;
308 }
309 
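// Return the maximum number of bytes of LDS a workgroup can use without
// restricting occupancy below NWaves waves per EU, given the kernel's
// maximum flat work group size.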
310 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
311   const Function &F) const {
312   if (NWaves == 1)
313     return getLocalMemorySize();
314   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
315   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
316   if (!WorkGroupsPerCu)
317     return 0;
318   unsigned MaxWaves = getMaxWavesPerEU();
319   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
320 }
321 
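// Return the occupancy (waves per EU) achievable when a workgroup of F uses
// Bytes bytes of LDS, clamped to the range [1, getMaxWavesPerEU()].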
322 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
323   const Function &F) const {
324   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
325   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
326   if (!WorkGroupsPerCu)
327     return 0;
328   unsigned MaxWaves = getMaxWavesPerEU();
329   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
330   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
331   NumWaves = std::min(NumWaves, MaxWaves);
332   NumWaves = std::max(NumWaves, 1u);
333   return NumWaves;
334 }
335 
336 unsigned
337 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
338   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
339   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
340 }
341 
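// Default flat work group size bounds depend on the calling convention:
// compute kernels get [2 * wavefront, max(4 * wavefront, 256)], graphics
// shaders a single wavefront, and all other functions up to 16 wavefronts.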
342 std::pair<unsigned, unsigned>
343 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
344   switch (CC) {
345   case CallingConv::AMDGPU_CS:
346   case CallingConv::AMDGPU_KERNEL:
347   case CallingConv::SPIR_KERNEL:
348     return std::make_pair(getWavefrontSize() * 2,
349                           std::max(getWavefrontSize() * 4, 256u));
350   case CallingConv::AMDGPU_VS:
351   case CallingConv::AMDGPU_LS:
352   case CallingConv::AMDGPU_HS:
353   case CallingConv::AMDGPU_ES:
354   case CallingConv::AMDGPU_GS:
355   case CallingConv::AMDGPU_PS:
356     return std::make_pair(1, getWavefrontSize());
357   default:
358     return std::make_pair(1, 16 * getWavefrontSize());
359   }
360 }
361 
362 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
363   const Function &F) const {
364   // FIXME: 1024 if function.
365   // Default minimum/maximum flat work group sizes.
366   std::pair<unsigned, unsigned> Default =
367     getDefaultFlatWorkGroupSize(F.getCallingConv());
368 
369   // Requested minimum/maximum flat work group sizes.
370   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
371     F, "amdgpu-flat-work-group-size", Default);
372 
  // Make sure requested minimum does not exceed requested maximum.
374   if (Requested.first > Requested.second)
375     return Default;
376 
377   // Make sure requested values do not violate subtarget's specifications.
378   if (Requested.first < getMinFlatWorkGroupSize())
379     return Default;
380   if (Requested.second > getMaxFlatWorkGroupSize())
381     return Default;
382 
383   return Requested;
384 }
385 
386 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
387   const Function &F) const {
388   // Default minimum/maximum number of waves per execution unit.
389   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
390 
391   // Default/requested minimum/maximum flat work group sizes.
392   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
393 
394   // If minimum/maximum flat work group sizes were explicitly requested using
395   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
396   // number of waves per execution unit to values implied by requested
397   // minimum/maximum flat work group sizes.
398   unsigned MinImpliedByFlatWorkGroupSize =
399     getMaxWavesPerEU(FlatWorkGroupSizes.second);
400   bool RequestedFlatWorkGroupSize = false;
401 
402   if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
403     Default.first = MinImpliedByFlatWorkGroupSize;
404     RequestedFlatWorkGroupSize = true;
405   }
406 
407   // Requested minimum/maximum number of waves per execution unit.
408   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
409     F, "amdgpu-waves-per-eu", Default, true);
410 
  // Make sure requested minimum does not exceed requested maximum.
412   if (Requested.second && Requested.first > Requested.second)
413     return Default;
414 
415   // Make sure requested values do not violate subtarget's specifications.
416   if (Requested.first < getMinWavesPerEU() ||
417       Requested.first > getMaxWavesPerEU())
418     return Default;
419   if (Requested.second > getMaxWavesPerEU())
420     return Default;
421 
422   // Make sure requested values are compatible with values implied by requested
423   // minimum/maximum flat work group sizes.
424   if (RequestedFlatWorkGroupSize &&
425       Requested.first < MinImpliedByFlatWorkGroupSize)
426     return Default;
427 
428   return Requested;
429 }
430 
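// Attach !range metadata to a workitem id or local size query, based on the
// kernel's flat work group size limits (narrowed by reqd_work_group_size if
// present). Returns true if metadata was added.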
431 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
432   Function *Kernel = I->getParent()->getParent();
433   unsigned MinSize = 0;
434   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
435   bool IdQuery = false;
436 
  // If reqd_work_group_size is present, it narrows the value down.
438   if (auto *CI = dyn_cast<CallInst>(I)) {
439     const Function *F = CI->getCalledFunction();
440     if (F) {
441       unsigned Dim = UINT_MAX;
442       switch (F->getIntrinsicID()) {
443       case Intrinsic::amdgcn_workitem_id_x:
444       case Intrinsic::r600_read_tidig_x:
445         IdQuery = true;
446         LLVM_FALLTHROUGH;
447       case Intrinsic::r600_read_local_size_x:
448         Dim = 0;
449         break;
450       case Intrinsic::amdgcn_workitem_id_y:
451       case Intrinsic::r600_read_tidig_y:
452         IdQuery = true;
453         LLVM_FALLTHROUGH;
454       case Intrinsic::r600_read_local_size_y:
455         Dim = 1;
456         break;
457       case Intrinsic::amdgcn_workitem_id_z:
458       case Intrinsic::r600_read_tidig_z:
459         IdQuery = true;
460         LLVM_FALLTHROUGH;
461       case Intrinsic::r600_read_local_size_z:
462         Dim = 2;
463         break;
464       default:
465         break;
466       }
467       if (Dim <= 3) {
468         if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
469           if (Node->getNumOperands() == 3)
470             MinSize = MaxSize = mdconst::extract<ConstantInt>(
471                                   Node->getOperand(Dim))->getZExtValue();
472       }
473     }
474   }
475 
476   if (!MaxSize)
477     return false;
478 
479   // Range metadata is [Lo, Hi). For ID query we need to pass max size
480   // as Hi. For size query we need to pass Hi + 1.
481   if (IdQuery)
482     MinSize = 0;
483   else
484     ++MaxSize;
485 
486   MDBuilder MDB(I->getContext());
487   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
488                                                   APInt(32, MaxSize));
489   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
490   return true;
491 }
492 
493 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
494                                                  unsigned &MaxAlign) const {
495   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
496          F.getCallingConv() == CallingConv::SPIR_KERNEL);
497 
498   const DataLayout &DL = F.getParent()->getDataLayout();
499   uint64_t ExplicitArgBytes = 0;
500   MaxAlign = 1;
501 
502   for (const Argument &Arg : F.args()) {
503     Type *ArgTy = Arg.getType();
504 
505     unsigned Align = DL.getABITypeAlignment(ArgTy);
506     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
507     ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
508     MaxAlign = std::max(MaxAlign, Align);
509   }
510 
511   return ExplicitArgBytes;
512 }
513 
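// The kernarg segment covers the explicit arguments at their ABI offset plus
// any implicit (hidden) arguments, aligned so that scalar loads may safely
// dereference past the end.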
514 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
515                                                 unsigned &MaxAlign) const {
516   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
517 
518   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
519 
520   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
521   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
522   if (ImplicitBytes != 0) {
523     unsigned Alignment = getAlignmentForImplicitArgPtr();
524     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
525   }
526 
527   // Being able to dereference past the end is useful for emitting scalar loads.
528   return alignTo(TotalSize, 4);
529 }
530 
531 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
532                              const TargetMachine &TM) :
533   R600GenSubtargetInfo(TT, GPU, FS),
534   AMDGPUSubtarget(TT),
535   InstrInfo(*this),
536   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
537   FMA(false),
538   CaymanISA(false),
539   CFALUBug(false),
540   HasVertexCache(false),
541   R600ALUInst(false),
542   FP64(false),
543   TexVTXClauseSize(0),
544   Gen(R600),
545   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
546   InstrItins(getInstrItineraryForCPU(GPU)) { }
547 
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
550   // Track register pressure so the scheduler can try to decrease
551   // pressure once register usage is above the threshold defined by
552   // SIRegisterInfo::getRegPressureSetLimit()
553   Policy.ShouldTrackPressure = true;
554 
555   // Enabling both top down and bottom up scheduling seems to give us less
556   // register spills than just using one of these approaches on its own.
557   Policy.OnlyTopDown = false;
558   Policy.OnlyBottomUp = false;
559 
560   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
561   if (!enableSIScheduler())
562     Policy.ShouldTrackLaneMasks = true;
563 }
564 
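// V_MAD_F16 is available only if its pseudo maps to a real MC opcode on this
// subtarget.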
565 bool GCNSubtarget::hasMadF16() const {
566   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
567 }
568 
569 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
570   if (getGeneration() >= AMDGPUSubtarget::GFX10)
571     return getMaxWavesPerEU();
572 
573   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
574     if (SGPRs <= 80)
575       return 10;
576     if (SGPRs <= 88)
577       return 9;
578     if (SGPRs <= 100)
579       return 8;
580     return 7;
581   }
582   if (SGPRs <= 48)
583     return 10;
584   if (SGPRs <= 56)
585     return 9;
586   if (SGPRs <= 64)
587     return 8;
588   if (SGPRs <= 72)
589     return 7;
590   if (SGPRs <= 80)
591     return 6;
592   return 5;
593 }
594 
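// Round the VGPR count up to the allocation granule and see how many such
// allocations fit in the register file; e.g. with 256 VGPRs per SIMD and 100
// VGPRs used, at most 256 / 100 = 2 waves can be resident.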
595 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
596   unsigned MaxWaves = getMaxWavesPerEU();
597   unsigned Granule = getVGPRAllocGranule();
598   if (VGPRs < Granule)
599     return MaxWaves;
600   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
601   return std::min(getTotalNumVGPRs() / RoundedRegs, MaxWaves);
602 }
603 
604 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
605   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
606   if (getGeneration() >= AMDGPUSubtarget::GFX10)
607     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
608 
609   if (MFI.hasFlatScratchInit()) {
610     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
611       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
612     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
613       return 4; // FLAT_SCRATCH, VCC (in that order).
614   }
615 
616   if (isXNACKEnabled())
617     return 4; // XNACK, VCC (in that order).
618   return 2; // VCC.
619 }
620 
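// The achievable occupancy is the minimum of the limits imposed by LDS usage
// and, when given, by SGPR and VGPR usage.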
621 unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
622                                         unsigned LDSSize,
623                                         unsigned NumSGPRs,
624                                         unsigned NumVGPRs) const {
625   unsigned Occupancy =
626     std::min(getMaxWavesPerEU(),
627              getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
628   if (NumSGPRs)
629     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
630   if (NumVGPRs)
631     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
632   return Occupancy;
633 }
634 
635 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
636   const Function &F = MF.getFunction();
637   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
638 
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
641   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
642   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
643   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
644 
645   // Check if maximum number of SGPRs was explicitly requested using
646   // "amdgpu-num-sgpr" attribute.
647   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
648     unsigned Requested = AMDGPU::getIntegerAttribute(
649       F, "amdgpu-num-sgpr", MaxNumSGPRs);
650 
651     // Make sure requested value does not violate subtarget's specifications.
652     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
653       Requested = 0;
654 
655     // If more SGPRs are required to support the input user/system SGPRs,
656     // increase to accommodate them.
657     //
658     // FIXME: This really ends up using the requested number of SGPRs + number
659     // of reserved special registers in total. Theoretically you could re-use
660     // the last input registers for these special registers, but this would
661     // require a lot of complexity to deal with the weird aliasing.
662     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
663     if (Requested && Requested < InputNumSGPRs)
664       Requested = InputNumSGPRs;
665 
666     // Make sure requested value is compatible with values implied by
667     // default/requested minimum/maximum number of waves per execution unit.
668     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
669       Requested = 0;
670     if (WavesPerEU.second &&
671         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
672       Requested = 0;
673 
674     if (Requested)
675       MaxNumSGPRs = Requested;
676   }
677 
678   if (hasSGPRInitBug())
679     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
680 
681   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
682                   MaxAddressableNumSGPRs);
683 }
684 
685 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
686   const Function &F = MF.getFunction();
687   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
688 
  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
691   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
692   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
693 
694   // Check if maximum number of VGPRs was explicitly requested using
695   // "amdgpu-num-vgpr" attribute.
696   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
697     unsigned Requested = AMDGPU::getIntegerAttribute(
698       F, "amdgpu-num-vgpr", MaxNumVGPRs);
699 
700     // Make sure requested value is compatible with values implied by
701     // default/requested minimum/maximum number of waves per execution unit.
702     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
703       Requested = 0;
704     if (WavesPerEU.second &&
705         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
706       Requested = 0;
707 
708     if (Requested)
709       MaxNumVGPRs = Requested;
710   }
711 
712   return MaxNumVGPRs;
713 }
714 
715 namespace {
716 struct MemOpClusterMutation : ScheduleDAGMutation {
717   const SIInstrInfo *TII;
718 
719   MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
720 
721   void apply(ScheduleDAGInstrs *DAG) override {
722     SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
727     for (SUnit &SU : DAG->SUnits) {
728       MachineInstr &MI2 = *SU.getInstr();
729       if (!MI2.mayLoad() && !MI2.mayStore()) {
730         SUa = nullptr;
731         continue;
732       }
733       if (!SUa) {
734         SUa = &SU;
735         continue;
736       }
737 
738       MachineInstr &MI1 = *SUa->getInstr();
739       if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
740           (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
741           (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
742           (TII->isDS(MI1)   && TII->isDS(MI2))) {
743         SU.addPredBarrier(SUa);
744 
745         for (const SDep &SI : SU.Preds) {
746           if (SI.getSUnit() != SUa)
747             SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
748         }
749 
750         if (&SU != &DAG->ExitSU) {
751           for (const SDep &SI : SUa->Succs) {
752             if (SI.getSUnit() != &SU)
753               SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
754           }
755         }
756       }
757 
758       SUa = &SU;
759     }
760   }
761 };
762 
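// DAG mutation that links independent SALU instructions behind long-latency
// MFMA instructions so they can fill the MFMA shadow (see apply() below).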
763 struct FillMFMAShadowMutation : ScheduleDAGMutation {
764   const SIInstrInfo *TII;
765 
766   ScheduleDAGMI *DAG;
767 
768   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
769 
770   bool isSALU(const SUnit *SU) const {
771     const MachineInstr *MI = SU->getInstr();
772     return MI && TII->isSALU(*MI) && !MI->isTerminator();
773   }
774 
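  // Returns true if an artificial edge making Pred a predecessor of Succ can
  // be added without creating a cycle, i.e. Pred is not already transitively
  // reachable from Succ.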
775   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
776     if (Pred->NodeNum < Succ->NodeNum)
777       return true;
778 
779     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
780 
781     for (unsigned I = 0; I < Succs.size(); ++I) {
782       for (const SDep &SI : Succs[I]->Succs) {
783         const SUnit *SU = SI.getSUnit();
784         if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
785           Succs.push_back(SU);
786       }
787     }
788 
789     SmallPtrSet<const SUnit*, 32> Visited;
790     while (!Preds.empty()) {
791       const SUnit *SU = Preds.pop_back_val();
792       if (llvm::find(Succs, SU) != Succs.end())
793         return false;
794       Visited.insert(SU);
795       for (const SDep &SI : SU->Preds)
796         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
797           Preds.push_back(SI.getSUnit());
798     }
799 
800     return true;
801   }
802 
  // Link as many SALU instructions in a chain as possible. Links up to
  // MaxChain instructions and returns the number of instructions linked.
805   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
806                          SmallPtrSetImpl<SUnit *> &Visited) const {
807     SmallVector<SUnit *, 8> Worklist({To});
808     unsigned Linked = 0;
809 
810     while (!Worklist.empty() && MaxChain-- > 0) {
811       SUnit *SU = Worklist.pop_back_val();
812       if (!Visited.insert(SU).second)
813         continue;
814 
      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
817 
818       if (SU->addPred(SDep(From, SDep::Artificial), false))
819         ++Linked;
820 
821       for (SDep &SI : From->Succs) {
822         SUnit *SUv = SI.getSUnit();
823         if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
824           SUv->addPred(SDep(SU, SDep::Artificial), false);
825       }
826 
827       for (SDep &SI : SU->Succs) {
828         SUnit *Succ = SI.getSUnit();
829         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
830           Worklist.push_back(Succ);
831       }
832     }
833 
834     return Linked;
835   }
836 
837   void apply(ScheduleDAGInstrs *DAGInstrs) override {
838     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
839     if (!ST.hasMAIInsts() || DisablePowerSched)
840       return;
841     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
842     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
843     if (!TSchedModel || DAG->SUnits.empty())
844       return;
845 
    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU instructions rather than VALU
    // helps prevent power consumption bursts and throttling.
850     auto LastSALU = DAG->SUnits.begin();
851     auto E = DAG->SUnits.end();
852     SmallPtrSet<SUnit*, 32> Visited;
853     for (SUnit &SU : DAG->SUnits) {
854       MachineInstr &MAI = *SU.getInstr();
855       if (!TII->isMAI(MAI) ||
856            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
857            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
858         continue;
859 
860       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
861 
862       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
863                  dbgs() << "Need " << Lat
864                         << " instructions to cover latency.\n");
865 
866       // Find up to Lat independent scalar instructions as early as
867       // possible such that they can be scheduled after this MFMA.
868       for ( ; Lat && LastSALU != E; ++LastSALU) {
869         if (Visited.count(&*LastSALU))
870           continue;
871 
872         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
873           continue;
874 
875         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
876       }
877     }
878   }
879 };
880 } // namespace
881 
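// Register the target-specific post-RA scheduling DAG mutations: keep
// consecutive memory operations together and fill MFMA shadows with SALU
// instructions.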
882 void GCNSubtarget::getPostRAMutations(
883     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
884   Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
885   Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
886 }
887 
888 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
889   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
890     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
891   else
892     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
893 }
894 
895 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
896   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
897     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
898   else
899     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
900 }
901