1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/SmallString.h"
18 #include "llvm/CodeGen/MachineScheduler.h"
19 #include "llvm/IR/MDBuilder.h"
20 #include "llvm/Target/TargetFrameLowering.h"
21 #include <algorithm>
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "amdgpu-subtarget"
26 
27 #define GET_SUBTARGETINFO_TARGET_DESC
28 #define GET_SUBTARGETINFO_CTOR
29 #include "AMDGPUGenSubtargetInfo.inc"
30 
31 AMDGPUSubtarget::~AMDGPUSubtarget() = default;
32 
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults are prepended so anything in the user-supplied FS (appended
  // below) overrides them during feature-string parsing.
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
74 
// Construct the subtarget: every feature flag starts at its "off" default
// here; initializeSubtargetDependencies() then parses the CPU/feature strings
// and applies per-OS and per-generation adjustments. The initializer order
// must match the member declaration order in AMDGPUSubtarget.h.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples start at SOUTHERN_ISLANDS (GCN); everything else is R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Operation-rate features.
    FastFMAF32(false),
    HalfRate64Ops(false),

    // FP mode / memory-access features.
    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    // Optimization toggles.
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    // ISA capability flags.
    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),

    // R600-family specifics.
    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}
144 
145 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
146   const Function &F) const {
147   if (NWaves == 1)
148     return getLocalMemorySize();
149   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
150   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
151   unsigned MaxWaves = getMaxWavesPerEU();
152   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
153 }
154 
155 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
156   const Function &F) const {
157   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
158   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
159   unsigned MaxWaves = getMaxWavesPerEU();
160   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
161   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
162   NumWaves = std::min(NumWaves, MaxWaves);
163   NumWaves = std::max(NumWaves, 1u);
164   return NumWaves;
165 }
166 
167 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
168   const Function &F) const {
169   // Default minimum/maximum flat work group sizes.
170   std::pair<unsigned, unsigned> Default =
171     AMDGPU::isCompute(F.getCallingConv()) ?
172       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
173                                     getWavefrontSize() * 4) :
174       std::pair<unsigned, unsigned>(1, getWavefrontSize());
175 
176   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
177   // starts using "amdgpu-flat-work-group-size" attribute.
178   Default.second = AMDGPU::getIntegerAttribute(
179     F, "amdgpu-max-work-group-size", Default.second);
180   Default.first = std::min(Default.first, Default.second);
181 
182   // Requested minimum/maximum flat work group sizes.
183   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
184     F, "amdgpu-flat-work-group-size", Default);
185 
186   // Make sure requested minimum is less than requested maximum.
187   if (Requested.first > Requested.second)
188     return Default;
189 
190   // Make sure requested values do not violate subtarget's specifications.
191   if (Requested.first < getMinFlatWorkGroupSize())
192     return Default;
193   if (Requested.second > getMaxFlatWorkGroupSize())
194     return Default;
195 
196   return Requested;
197 }
198 
/// Compute the minimum/maximum number of waves per execution unit for \p F,
/// honoring the "amdgpu-waves-per-eu" attribute and falling back to the
/// subtarget defaults whenever a request is inconsistent or unsupported.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit. The last
  // argument makes only the first element of the pair required; a requested
  // maximum of 0 therefore means "unspecified".
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
246 
247 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
248   Function *Kernel = I->getParent()->getParent();
249   unsigned MinSize = 0;
250   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
251   bool IdQuery = false;
252 
253   // If reqd_work_group_size is present it narrows value down.
254   if (auto *CI = dyn_cast<CallInst>(I)) {
255     const Function *F = CI->getCalledFunction();
256     if (F) {
257       unsigned Dim = UINT_MAX;
258       switch (F->getIntrinsicID()) {
259       case Intrinsic::amdgcn_workitem_id_x:
260       case Intrinsic::r600_read_tidig_x:
261         IdQuery = true;
262       case Intrinsic::r600_read_local_size_x:
263         Dim = 0;
264         break;
265       case Intrinsic::amdgcn_workitem_id_y:
266       case Intrinsic::r600_read_tidig_y:
267         IdQuery = true;
268       case Intrinsic::r600_read_local_size_y:
269         Dim = 1;
270         break;
271       case Intrinsic::amdgcn_workitem_id_z:
272       case Intrinsic::r600_read_tidig_z:
273         IdQuery = true;
274       case Intrinsic::r600_read_local_size_z:
275         Dim = 2;
276         break;
277       default:
278         break;
279       }
280       if (Dim <= 3) {
281         if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
282           if (Node->getNumOperands() == 3)
283             MinSize = MaxSize = mdconst::extract<ConstantInt>(
284                                   Node->getOperand(Dim))->getZExtValue();
285       }
286     }
287   }
288 
289   if (!MaxSize)
290     return false;
291 
292   // Range metadata is [Lo, Hi). For ID query we need to pass max size
293   // as Hi. For size query we need to pass Hi + 1.
294   if (IdQuery)
295     MinSize = 0;
296   else
297     ++MaxSize;
298 
299   MDBuilder MDB(I->getContext());
300   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
301                                                   APInt(32, MaxSize));
302   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
303   return true;
304 }
305 
// R600-family subtarget: wires up the R600 instruction info, frame lowering
// (stack grows up, no local-area offset) and target lowering, after common
// AMDGPU feature initialization.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
312 
// GCN (SI+) subtarget: wires up the SI instruction info, frame lowering
// (stack grows up, no local-area offset) and target lowering, after common
// AMDGPU feature initialization.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
319 
320 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
321                                       unsigned NumRegionInstrs) const {
322   // Track register pressure so the scheduler can try to decrease
323   // pressure once register usage is above the threshold defined by
324   // SIRegisterInfo::getRegPressureSetLimit()
325   Policy.ShouldTrackPressure = true;
326 
327   // Enabling both top down and bottom up scheduling seems to give us less
328   // register spills than just using one of these approaches on its own.
329   Policy.OnlyTopDown = false;
330   Policy.OnlyBottomUp = false;
331 
332   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
333   if (!enableSIScheduler())
334     Policy.ShouldTrackLaneMasks = true;
335 }
336 
337 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
338   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
339 }
340 
341 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
342                                             unsigned ExplicitArgBytes) const {
343   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
344   if (ImplicitBytes == 0)
345     return ExplicitArgBytes;
346 
347   unsigned Alignment = getAlignmentForImplicitArgPtr();
348   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
349 }
350 
351 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
352   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
353     if (SGPRs <= 80)
354       return 10;
355     if (SGPRs <= 88)
356       return 9;
357     if (SGPRs <= 100)
358       return 8;
359     return 7;
360   }
361   if (SGPRs <= 48)
362     return 10;
363   if (SGPRs <= 56)
364     return 9;
365   if (SGPRs <= 64)
366     return 8;
367   if (SGPRs <= 72)
368     return 7;
369   if (SGPRs <= 80)
370     return 6;
371   return 5;
372 }
373 
374 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
375   if (VGPRs <= 24)
376     return 10;
377   if (VGPRs <= 28)
378     return 9;
379   if (VGPRs <= 32)
380     return 8;
381   if (VGPRs <= 36)
382     return 7;
383   if (VGPRs <= 40)
384     return 6;
385   if (VGPRs <= 48)
386     return 5;
387   if (VGPRs <= 64)
388     return 4;
389   if (VGPRs <= 84)
390     return 3;
391   if (VGPRs <= 128)
392     return 2;
393   return 1;
394 }
395 
396 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
397   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
398   if (MFI.hasFlatScratchInit()) {
399     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
400       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
401     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
402       return 4; // FLAT_SCRATCH, VCC (in that order).
403   }
404 
405   if (isXNACKEnabled())
406     return 4; // XNACK, VCC (in that order).
407   return 2; // VCC.
408 }
409 
/// Compute the maximum number of SGPRs \p MF may use, honoring the
/// "amdgpu-num-sgpr" attribute when present, the reserved special registers
/// (VCC, FLAT_SCRATCH, XNACK), the wave-occupancy bounds and the SGPR init
/// hardware bug.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // Requested == 0 means "ignore the request" in every check below.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware bug workaround: affected parts must always allocate a fixed
  // number of SGPRs regardless of what the function actually needs.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
459 
/// Compute the maximum number of VGPRs \p MF may use, honoring the
/// "amdgpu-num-vgpr" attribute when present, the reserved VGPRs and the
/// wave-occupancy bounds.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // Requested == 0 means "ignore the request" in every check below.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
493