1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/SmallString.h"
18 #include "llvm/CodeGen/MachineScheduler.h"
19 #include "llvm/IR/MDBuilder.h"
20 #include "llvm/Target/TargetFrameLowering.h"
21 #include <algorithm>
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "amdgpu-subtarget"
26 
27 #define GET_SUBTARGETINFO_TARGET_DESC
28 #define GET_SUBTARGETINFO_CTOR
29 #include "AMDGPUGenSubtargetInfo.inc"
30 
// Defined out of line (the header only declares the destructor).
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
32 
/// Build the full feature string (subtarget defaults + user-specified \p FS),
/// parse it, and then patch up feature interactions that cannot be expressed
/// as plain subtarget features. Returns *this so it can be used in a
/// constructor init list.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults go first; the user string is appended last so it can override
  // them.
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
74 
/// Every feature flag starts out false/zero here; the real values are filled
/// in by ParseSubtargetFeatures, which is invoked from
/// initializeSubtargetDependencies at the end of this constructor.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAClampVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  // Address-space mapping depends only on the triple.
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}
150 
151 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
152   const Function &F) const {
153   if (NWaves == 1)
154     return getLocalMemorySize();
155   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
156   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
157   unsigned MaxWaves = getMaxWavesPerEU();
158   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
159 }
160 
161 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
162   const Function &F) const {
163   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
164   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
165   unsigned MaxWaves = getMaxWavesPerEU();
166   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
167   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
168   NumWaves = std::min(NumWaves, MaxWaves);
169   NumWaves = std::max(NumWaves, 1u);
170   return NumWaves;
171 }
172 
173 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
174   const Function &F) const {
175   // Default minimum/maximum flat work group sizes.
176   std::pair<unsigned, unsigned> Default =
177     AMDGPU::isCompute(F.getCallingConv()) ?
178       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
179                                     getWavefrontSize() * 4) :
180       std::pair<unsigned, unsigned>(1, getWavefrontSize());
181 
182   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
183   // starts using "amdgpu-flat-work-group-size" attribute.
184   Default.second = AMDGPU::getIntegerAttribute(
185     F, "amdgpu-max-work-group-size", Default.second);
186   Default.first = std::min(Default.first, Default.second);
187 
188   // Requested minimum/maximum flat work group sizes.
189   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
190     F, "amdgpu-flat-work-group-size", Default);
191 
192   // Make sure requested minimum is less than requested maximum.
193   if (Requested.first > Requested.second)
194     return Default;
195 
196   // Make sure requested values do not violate subtarget's specifications.
197   if (Requested.first < getMinFlatWorkGroupSize())
198     return Default;
199   if (Requested.second > getMaxFlatWorkGroupSize())
200     return Default;
201 
202   return Requested;
203 }
204 
/// Return the (minimum, maximum) number of waves per execution unit for
/// \p F, honoring the "amdgpu-waves-per-eu" attribute when its values are
/// consistent with both the subtarget limits and the (possibly requested)
/// flat work group sizes. Any invalid request falls back to the defaults.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (A requested maximum of 0 skips this check — presumably "unspecified";
  // confirm against getIntegerPairAttribute's contract.)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
252 
253 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
254   Function *Kernel = I->getParent()->getParent();
255   unsigned MinSize = 0;
256   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
257   bool IdQuery = false;
258 
259   // If reqd_work_group_size is present it narrows value down.
260   if (auto *CI = dyn_cast<CallInst>(I)) {
261     const Function *F = CI->getCalledFunction();
262     if (F) {
263       unsigned Dim = UINT_MAX;
264       switch (F->getIntrinsicID()) {
265       case Intrinsic::amdgcn_workitem_id_x:
266       case Intrinsic::r600_read_tidig_x:
267         IdQuery = true;
268       case Intrinsic::r600_read_local_size_x:
269         Dim = 0;
270         break;
271       case Intrinsic::amdgcn_workitem_id_y:
272       case Intrinsic::r600_read_tidig_y:
273         IdQuery = true;
274       case Intrinsic::r600_read_local_size_y:
275         Dim = 1;
276         break;
277       case Intrinsic::amdgcn_workitem_id_z:
278       case Intrinsic::r600_read_tidig_z:
279         IdQuery = true;
280       case Intrinsic::r600_read_local_size_z:
281         Dim = 2;
282         break;
283       default:
284         break;
285       }
286       if (Dim <= 3) {
287         if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
288           if (Node->getNumOperands() == 3)
289             MinSize = MaxSize = mdconst::extract<ConstantInt>(
290                                   Node->getOperand(Dim))->getZExtValue();
291       }
292     }
293   }
294 
295   if (!MaxSize)
296     return false;
297 
298   // Range metadata is [Lo, Hi). For ID query we need to pass max size
299   // as Hi. For size query we need to pass Hi + 1.
300   if (IdQuery)
301     MinSize = 0;
302   else
303     ++MaxSize;
304 
305   MDBuilder MDB(I->getContext());
306   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
307                                                   APInt(32, MaxSize));
308   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
309   return true;
310 }
311 
// R600 family subtarget: wires up the R600-specific instruction info, frame
// lowering (private stack grows up), and target lowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
318 
// GCN (SI+) subtarget: wires up the SI-specific instruction info, frame
// lowering (private stack grows up), and target lowering.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
325 
326 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
327                                       unsigned NumRegionInstrs) const {
328   // Track register pressure so the scheduler can try to decrease
329   // pressure once register usage is above the threshold defined by
330   // SIRegisterInfo::getRegPressureSetLimit()
331   Policy.ShouldTrackPressure = true;
332 
333   // Enabling both top down and bottom up scheduling seems to give us less
334   // register spills than just using one of these approaches on its own.
335   Policy.OnlyTopDown = false;
336   Policy.OnlyBottomUp = false;
337 
338   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
339   if (!enableSIScheduler())
340     Policy.ShouldTrackLaneMasks = true;
341 }
342 
343 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
344   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
345 }
346 
347 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
348                                             unsigned ExplicitArgBytes) const {
349   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
350   if (ImplicitBytes == 0)
351     return ExplicitArgBytes;
352 
353   unsigned Alignment = getAlignmentForImplicitArgPtr();
354   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
355 }
356 
357 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
358   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
359     if (SGPRs <= 80)
360       return 10;
361     if (SGPRs <= 88)
362       return 9;
363     if (SGPRs <= 100)
364       return 8;
365     return 7;
366   }
367   if (SGPRs <= 48)
368     return 10;
369   if (SGPRs <= 56)
370     return 9;
371   if (SGPRs <= 64)
372     return 8;
373   if (SGPRs <= 72)
374     return 7;
375   if (SGPRs <= 80)
376     return 6;
377   return 5;
378 }
379 
380 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
381   if (VGPRs <= 24)
382     return 10;
383   if (VGPRs <= 28)
384     return 9;
385   if (VGPRs <= 32)
386     return 8;
387   if (VGPRs <= 36)
388     return 7;
389   if (VGPRs <= 40)
390     return 6;
391   if (VGPRs <= 48)
392     return 5;
393   if (VGPRs <= 64)
394     return 4;
395   if (VGPRs <= 84)
396     return 3;
397   if (VGPRs <= 128)
398     return 2;
399   return 1;
400 }
401 
402 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
403   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
404   if (MFI.hasFlatScratchInit()) {
405     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
406       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
407     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
408       return 4; // FLAT_SCRATCH, VCC (in that order).
409   }
410 
411   if (isXNACKEnabled())
412     return 4; // XNACK, VCC (in that order).
413   return 2; // VCC.
414 }
415 
/// Maximum number of SGPRs available to \p MF, after honoring the
/// "amdgpu-num-sgpr" attribute (when valid), the waves-per-EU constraints,
/// the SGPR-init hardware bug workaround, and the reserved special SGPRs.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 below means "ignore the attribute".)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  // Reserved SGPRs come out of the budget; never exceed the addressable
  // limit.
  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
465 
/// Maximum number of VGPRs available to \p MF, after honoring the
/// "amdgpu-num-vgpr" attribute (when valid), the waves-per-EU constraints,
/// and the reserved VGPRs. Mirrors the SGPR logic in getMaxNumSGPRs.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 below means "ignore the attribute".)
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  // Reserved VGPRs come out of the budget.
  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
499