1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUTargetMachine.h"
18 #ifdef LLVM_BUILD_GLOBAL_ISEL
19 #include "AMDGPUCallLowering.h"
20 #include "AMDGPUInstructionSelector.h"
21 #include "AMDGPULegalizerInfo.h"
22 #include "AMDGPURegisterBankInfo.h"
23 #endif
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/ADT/SmallString.h"
26 #include "llvm/CodeGen/MachineScheduler.h"
27 #include "llvm/IR/MDBuilder.h"
28 #include "llvm/Target/TargetFrameLowering.h"
29 #include <algorithm>
30 
31 using namespace llvm;
32 
33 #define DEBUG_TYPE "amdgpu-subtarget"
34 
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #define GET_SUBTARGETINFO_CTOR
37 #include "AMDGPUGenSubtargetInfo.inc"
38 
// Defaulted destructor defined out-of-line. NOTE(review): presumably anchored
// in this .cpp so the header can hold members of types that are incomplete
// there — confirm against AMDGPUSubtarget.h.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
40 
// Parse the CPU/feature strings and apply target-specific defaults before the
// user's feature string. Returns *this so it can be used in a constructor's
// initializer chain.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // The user's FS is appended after these defaults, so explicit +/- features
  // always override them.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed (a private-element-size feature was not given).
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
82 
83 #ifdef LLVM_BUILD_GLOBAL_ISEL
84 namespace {
85 
// Concrete GISelAccessor that owns the four GlobalISel components. The
// unique_ptr members are populated by the SISubtarget constructor; the
// overrides simply expose non-owning views of them.
struct SIGISelActualAccessor : public GISelAccessor {
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<RegisterBankInfo> RegBankInfo;
  const AMDGPUCallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
  const InstructionSelector *getInstructionSelector() const override {
    return InstSelector.get();
  }
  const LegalizerInfo *getLegalizerInfo() const override {
    return Legalizer.get();
  }
  const RegisterBankInfo *getRegBankInfo() const override {
    return RegBankInfo.get();
  }
};
104 
105 } // end anonymous namespace
106 #endif
107 
// Construct the subtarget: every feature flag starts false/zero here and is
// then set from the CPU/feature strings by initializeSubtargetDependencies()
// at the end of the body. NOTE: member initializers must stay in the same
// order as the member declarations in AMDGPUSubtarget.h (C++ initializes in
// declaration order regardless of the order written here).
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples default to the GCN baseline; everything else is R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  // Apply defaults and parse the feature string; this sets most of the flags
  // initialized to false above.
  initializeSubtargetDependencies(TT, GPU, FS);
}
184 
185 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
186   const Function &F) const {
187   if (NWaves == 1)
188     return getLocalMemorySize();
189   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
190   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
191   unsigned MaxWaves = getMaxWavesPerEU();
192   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
193 }
194 
195 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
196   const Function &F) const {
197   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
198   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
199   unsigned MaxWaves = getMaxWavesPerEU();
200   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
201   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
202   NumWaves = std::min(NumWaves, MaxWaves);
203   NumWaves = std::max(NumWaves, 1u);
204   return NumWaves;
205 }
206 
207 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
208   const Function &F) const {
209   // Default minimum/maximum flat work group sizes.
210   std::pair<unsigned, unsigned> Default =
211     AMDGPU::isCompute(F.getCallingConv()) ?
212       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
213                                     getWavefrontSize() * 4) :
214       std::pair<unsigned, unsigned>(1, getWavefrontSize());
215 
216   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
217   // starts using "amdgpu-flat-work-group-size" attribute.
218   Default.second = AMDGPU::getIntegerAttribute(
219     F, "amdgpu-max-work-group-size", Default.second);
220   Default.first = std::min(Default.first, Default.second);
221 
222   // Requested minimum/maximum flat work group sizes.
223   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
224     F, "amdgpu-flat-work-group-size", Default);
225 
226   // Make sure requested minimum is less than requested maximum.
227   if (Requested.first > Requested.second)
228     return Default;
229 
230   // Make sure requested values do not violate subtarget's specifications.
231   if (Requested.first < getMinFlatWorkGroupSize())
232     return Default;
233   if (Requested.second > getMaxFlatWorkGroupSize())
234     return Default;
235 
236   return Requested;
237 }
238 
239 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
240   const Function &F) const {
241   // Default minimum/maximum number of waves per execution unit.
242   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
243 
244   // Default/requested minimum/maximum flat work group sizes.
245   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
246 
247   // If minimum/maximum flat work group sizes were explicitly requested using
248   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
249   // number of waves per execution unit to values implied by requested
250   // minimum/maximum flat work group sizes.
251   unsigned MinImpliedByFlatWorkGroupSize =
252     getMaxWavesPerEU(FlatWorkGroupSizes.second);
253   bool RequestedFlatWorkGroupSize = false;
254 
255   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
256   // starts using "amdgpu-flat-work-group-size" attribute.
257   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
258       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
259     Default.first = MinImpliedByFlatWorkGroupSize;
260     RequestedFlatWorkGroupSize = true;
261   }
262 
263   // Requested minimum/maximum number of waves per execution unit.
264   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
265     F, "amdgpu-waves-per-eu", Default, true);
266 
267   // Make sure requested minimum is less than requested maximum.
268   if (Requested.second && Requested.first > Requested.second)
269     return Default;
270 
271   // Make sure requested values do not violate subtarget's specifications.
272   if (Requested.first < getMinWavesPerEU() ||
273       Requested.first > getMaxWavesPerEU())
274     return Default;
275   if (Requested.second > getMaxWavesPerEU())
276     return Default;
277 
278   // Make sure requested values are compatible with values implied by requested
279   // minimum/maximum flat work group sizes.
280   if (RequestedFlatWorkGroupSize &&
281       Requested.first < MinImpliedByFlatWorkGroupSize)
282     return Default;
283 
284   return Requested;
285 }
286 
// Attach !range metadata to a work-item-id or local-size query instruction,
// derived from the kernel's flat work group size (narrowed further by
// reqd_work_group_size metadata when present). Returns true if metadata was
// attached, false if no useful upper bound was found.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  // IdQuery distinguishes "which lane am I" intrinsics (range [0, size)) from
  // "how big is the group" intrinsics (range [min, size]).
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Dim stays UINT_MAX unless the call is one of the recognized
      // per-dimension intrinsics below.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // Dim is 0/1/2 only when set above, so this is "was a dimension found".
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // No known upper bound -> nothing useful to attach.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
348 
// R600-family subtarget: wires up the R600 instruction info, frame lowering
// (stack grows up, no local area offset) and target lowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
355 
// GCN subtarget: in addition to the SI instruction/frame/lowering objects,
// installs the GlobalISel accessor. Without LLVM_BUILD_GLOBAL_ISEL a stub
// accessor (all getters return null) is installed instead.
// NOTE(review): GISel is allocated with `new` and handed to
// setGISelAccessor(); presumably that call transfers ownership — confirm
// against the GISelAccessor interface.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
#ifndef LLVM_BUILD_GLOBAL_ISEL
  GISelAccessor *GISel = new GISelAccessor();
#else
  SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
  GISel->CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  GISel->Legalizer.reset(new AMDGPULegalizerInfo());

  // The instruction selector depends on the register bank info, so build the
  // bank info first and pass it in.
  GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  GISel->InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get())));
#endif
  setGISelAccessor(*GISel);
}
374 
375 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
376                                       unsigned NumRegionInstrs) const {
377   // Track register pressure so the scheduler can try to decrease
378   // pressure once register usage is above the threshold defined by
379   // SIRegisterInfo::getRegPressureSetLimit()
380   Policy.ShouldTrackPressure = true;
381 
382   // Enabling both top down and bottom up scheduling seems to give us less
383   // register spills than just using one of these approaches on its own.
384   Policy.OnlyTopDown = false;
385   Policy.OnlyBottomUp = false;
386 
387   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
388   if (!enableSIScheduler())
389     Policy.ShouldTrackLaneMasks = true;
390 }
391 
392 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
393   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
394 }
395 
396 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
397                                             unsigned ExplicitArgBytes) const {
398   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
399   if (ImplicitBytes == 0)
400     return ExplicitArgBytes;
401 
402   unsigned Alignment = getAlignmentForImplicitArgPtr();
403   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
404 }
405 
406 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
407   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
408     if (SGPRs <= 80)
409       return 10;
410     if (SGPRs <= 88)
411       return 9;
412     if (SGPRs <= 100)
413       return 8;
414     return 7;
415   }
416   if (SGPRs <= 48)
417     return 10;
418   if (SGPRs <= 56)
419     return 9;
420   if (SGPRs <= 64)
421     return 8;
422   if (SGPRs <= 72)
423     return 7;
424   if (SGPRs <= 80)
425     return 6;
426   return 5;
427 }
428 
429 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
430   if (VGPRs <= 24)
431     return 10;
432   if (VGPRs <= 28)
433     return 9;
434   if (VGPRs <= 32)
435     return 8;
436   if (VGPRs <= 36)
437     return 7;
438   if (VGPRs <= 40)
439     return 6;
440   if (VGPRs <= 48)
441     return 5;
442   if (VGPRs <= 64)
443     return 4;
444   if (VGPRs <= 84)
445     return 3;
446   if (VGPRs <= 128)
447     return 2;
448   return 1;
449 }
450 
451 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
452   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
453   if (MFI.hasFlatScratchInit()) {
454     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
455       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
456     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
457       return 4; // FLAT_SCRATCH, VCC (in that order).
458   }
459 
460   if (isXNACKEnabled())
461     return 4; // XNACK, VCC (in that order).
462   return 2; // VCC.
463 }
464 
// Compute the number of SGPRs available for allocation in MF, after
// honoring the "amdgpu-num-sgpr" attribute (when valid), the waves-per-EU
// constraints, the SGPR init bug workaround, and the reserved SGPRs.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute. A Requested value of 0 below means
  // "request rejected, keep the computed maximum".
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications:
    // it must leave room for the reserved special SGPRs.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware bug workaround: affected subtargets must use a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  // Subtract the reserved SGPRs, and never exceed the addressable limit.
  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
514 
515 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
516   const Function &F = *MF.getFunction();
517   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
518 
519   // Compute maximum number of VGPRs function can use using default/requested
520   // minimum number of waves per execution unit.
521   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
522   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
523 
524   // Check if maximum number of VGPRs was explicitly requested using
525   // "amdgpu-num-vgpr" attribute.
526   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
527     unsigned Requested = AMDGPU::getIntegerAttribute(
528       F, "amdgpu-num-vgpr", MaxNumVGPRs);
529 
530     // Make sure requested value does not violate subtarget's specifications.
531     if (Requested && Requested <= getReservedNumVGPRs(MF))
532       Requested = 0;
533 
534     // Make sure requested value is compatible with values implied by
535     // default/requested minimum/maximum number of waves per execution unit.
536     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
537       Requested = 0;
538     if (WavesPerEU.second &&
539         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
540       Requested = 0;
541 
542     if (Requested)
543       MaxNumVGPRs = Requested;
544   }
545 
546   return MaxNumVGPRs - getReservedNumVGPRs(MF);
547 }
548