1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/SmallString.h"
18 #include "llvm/CodeGen/MachineScheduler.h"
19 #include "llvm/Target/TargetFrameLowering.h"
20 #include <algorithm>
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "amdgpu-subtarget"
25 
26 #define GET_SUBTARGETINFO_ENUM
27 #define GET_SUBTARGETINFO_TARGET_DESC
28 #define GET_SUBTARGETINFO_CTOR
29 #include "AMDGPUGenSubtargetInfo.inc"
30 
// Out-of-line defaulted destructor (declared in AMDGPUSubtarget.h).
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
32 
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults are prepended so that anything in the user feature string FS
  // (appended below) overrides them.
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  // Note: only the user-supplied FS (not FullFS) is checked, so the HSA
  // default above does not suppress this override.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
74 
// Construct the subtarget: all feature flags start false/zero and are then
// filled in from the feature string by initializeSubtargetDependencies().
// Note: initializer order must match the member declaration order in the
// header.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are GCN (SI at minimum); everything else is R600-class.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    // Floating-point mode / memory-access features.
    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    // Optimization toggles.
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    // GCN ISA capability flags.
    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    // R600-family flags.
    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}
136 
137 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
138   const Function &F) const {
139   if (NWaves == 1)
140     return getLocalMemorySize();
141   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
142   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
143   unsigned MaxWaves = getMaxWavesPerEU();
144   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
145 }
146 
147 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
148   const Function &F) const {
149   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
150   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
151   unsigned MaxWaves = getMaxWavesPerEU();
152   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
153   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
154   NumWaves = std::min(NumWaves, MaxWaves);
155   NumWaves = std::max(NumWaves, 1u);
156   return NumWaves;
157 }
158 
159 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
160   const Function &F) const {
161   // Default minimum/maximum flat work group sizes.
162   std::pair<unsigned, unsigned> Default =
163     AMDGPU::isCompute(F.getCallingConv()) ?
164       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
165                                     getWavefrontSize() * 4) :
166       std::pair<unsigned, unsigned>(1, getWavefrontSize());
167 
168   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
169   // starts using "amdgpu-flat-work-group-size" attribute.
170   Default.second = AMDGPU::getIntegerAttribute(
171     F, "amdgpu-max-work-group-size", Default.second);
172   Default.first = std::min(Default.first, Default.second);
173 
174   // Requested minimum/maximum flat work group sizes.
175   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
176     F, "amdgpu-flat-work-group-size", Default);
177 
178   // Make sure requested minimum is less than requested maximum.
179   if (Requested.first > Requested.second)
180     return Default;
181 
182   // Make sure requested values do not violate subtarget's specifications.
183   if (Requested.first < getMinFlatWorkGroupSize())
184     return Default;
185   if (Requested.second > getMaxFlatWorkGroupSize())
186     return Default;
187 
188   return Requested;
189 }
190 
/// Return F's (minimum, maximum) number of waves per execution unit,
/// reconciling the "amdgpu-waves-per-eu" attribute with the bounds implied
/// by the flat work group sizes. Falls back to the defaults whenever the
/// request is inconsistent.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  // The trailing 'true' allows a (min) request with no explicit maximum.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (A zero maximum means "unspecified" and is not checked here.)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
238 
// R600-family subtarget: wires up the R600 instruction info, frame lowering
// (stack grows up, no transient realignment), and target lowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
245 
// GCN (SI+) subtarget: wires up the SI instruction info, frame lowering
// (stack grows up, no transient realignment), and target lowering.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
252 
253 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
254                                       unsigned NumRegionInstrs) const {
255   // Track register pressure so the scheduler can try to decrease
256   // pressure once register usage is above the threshold defined by
257   // SIRegisterInfo::getRegPressureSetLimit()
258   Policy.ShouldTrackPressure = true;
259 
260   // Enabling both top down and bottom up scheduling seems to give us less
261   // register spills than just using one of these approaches on its own.
262   Policy.OnlyTopDown = false;
263   Policy.OnlyBottomUp = false;
264 
265   Policy.ShouldTrackLaneMasks = enableSubRegLiveness();
266 }
267 
268 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
269   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
270 }
271 
272 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
273                                             unsigned ExplicitArgBytes) const {
274   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
275   if (ImplicitBytes == 0)
276     return ExplicitArgBytes;
277 
278   unsigned Alignment = getAlignmentForImplicitArgPtr();
279   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
280 }
281 
282 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
283   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
284     if (SGPRs <= 80)
285       return 10;
286     if (SGPRs <= 88)
287       return 9;
288     if (SGPRs <= 100)
289       return 8;
290     return 7;
291   }
292   if (SGPRs <= 48)
293     return 10;
294   if (SGPRs <= 56)
295     return 9;
296   if (SGPRs <= 64)
297     return 8;
298   if (SGPRs <= 72)
299     return 7;
300   if (SGPRs <= 80)
301     return 6;
302   return 5;
303 }
304 
305 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
306   if (VGPRs <= 24)
307     return 10;
308   if (VGPRs <= 28)
309     return 9;
310   if (VGPRs <= 32)
311     return 8;
312   if (VGPRs <= 36)
313     return 7;
314   if (VGPRs <= 40)
315     return 6;
316   if (VGPRs <= 48)
317     return 5;
318   if (VGPRs <= 64)
319     return 4;
320   if (VGPRs <= 84)
321     return 3;
322   if (VGPRs <= 128)
323     return 2;
324   return 1;
325 }
326 
327 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
328   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
329   if (MFI.hasFlatScratchInit()) {
330     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
331       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
332     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
333       return 4; // FLAT_SCRATCH, VCC (in that order).
334   }
335 
336   if (isXNACKEnabled())
337     return 4; // XNACK, VCC (in that order).
338   return 2; // VCC.
339 }
340 
/// Return the maximum number of allocatable SGPRs for MF, honoring the
/// "amdgpu-num-sgpr" attribute when it is valid, accounting for reserved
/// special registers and the SGPR-init hardware bug.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 below means "request rejected, keep the default".)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware workaround: affected parts must report a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  // Reserved special registers come out of the budget, and the result can
  // never exceed the addressable SGPR range.
  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
390 
391 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
392   const Function &F = *MF.getFunction();
393   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
394 
395   // Compute maximum number of VGPRs function can use using default/requested
396   // minimum number of waves per execution unit.
397   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
398   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
399 
400   // Check if maximum number of VGPRs was explicitly requested using
401   // "amdgpu-num-vgpr" attribute.
402   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
403     unsigned Requested = AMDGPU::getIntegerAttribute(
404       F, "amdgpu-num-vgpr", MaxNumVGPRs);
405 
406     // Make sure requested value does not violate subtarget's specifications.
407     if (Requested && Requested <= getReservedNumVGPRs(MF))
408       Requested = 0;
409 
410     // Make sure requested value is compatible with values implied by
411     // default/requested minimum/maximum number of waves per execution unit.
412     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
413       Requested = 0;
414     if (WavesPerEU.second &&
415         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
416       Requested = 0;
417 
418     if (Requested)
419       MaxNumVGPRs = Requested;
420   }
421 
422   return MaxNumVGPRs - getReservedNumVGPRs(MF);
423 }
424