//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUInstructionSelector.h"
20 #include "AMDGPULegalizerInfo.h"
21 #include "AMDGPURegisterBankInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/IR/MDBuilder.h"
26 #include "llvm/Target/TargetFrameLowering.h"
27 #include <algorithm>
28 
29 using namespace llvm;
30 
31 #define DEBUG_TYPE "amdgpu-subtarget"
32 
33 #define GET_SUBTARGETINFO_TARGET_DESC
34 #define GET_SUBTARGETINFO_CTOR
35 #include "AMDGPUGenSubtargetInfo.inc"
36 
37 AMDGPUSubtarget::~AMDGPUSubtarget() = default;
38 
39 AMDGPUSubtarget &
40 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
41                                                  StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and instead run at the
  // double-precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

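  // Note that the user feature string FS is appended after these defaults, and
  // later entries take precedence, so an FS containing e.g. "-promote-alloca"
  // still overrides the "+promote-alloca" default below.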
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless flat-for-global is explicitly enabled or disabled in the feature
  // string, turn on FlatForGlobal for all OSes on VI and newer hardware to
  // avoid assertion failures due to missing ADDR64 variants of MUBUF
  // instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
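  // Illustrative arithmetic with assumed values: 65536 bytes of LDS,
  // MaxWaves = 10, WorkGroupsPerCu = 4 and NWaves = 5 give each wave
  // 65536 * 10 / 4 / 5 = 32768 bytes.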
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
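  // This is effectively the inverse of getMaxLocalMemSizeWithWaveCount: the
  // same relation is solved for the wave count, given the LDS bytes used.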
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);
  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);
  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query the values lie in
  // [0, MaxSize), so MaxSize itself is the correct Hi. For a size query the
  // value can equal MaxSize, so pass MaxSize + 1 as Hi.
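  // e.g. (illustrative) a workitem.id.x query under
  // reqd_work_group_size = {256, 1, 1} gets !range !{i32 0, i32 256}, while
  // the matching local_size query gets !range !{i32 256, i32 257}.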
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo());

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
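  // Illustrative: ExplicitArgBytes = 36 with Alignment = 8 gives
  // alignTo(36, 8) = 40, so the implicit arguments start at byte offset 40.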
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

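// The thresholds below reflect the size of the SGPR file: roughly 800
// allocatable SGPRs per CU on VI and newer (e.g. 800 / 80 = 10 waves) and 512
// on earlier generations (e.g. 512 / 64 = 8 waves), with allocation-granule
// rounding.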
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

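// Likewise for VGPRs: each SIMD has a 256-entry VGPR file, so e.g. 64 VGPRs
// per wave allow 256 / 64 = 4 waves, while 24 or fewer allow the maximum of
// 10 (256 / 24 rounds down to 10).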
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

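// Each of VCC, FLAT_SCRATCH and XNACK_MASK is a 64-bit register pair, i.e.
// two SGPRs, which gives the counts returned below.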
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && Requested <= getReservedNumSGPRs(MF))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}