1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/SmallString.h"
18 #include "llvm/CodeGen/MachineScheduler.h"
19 #include "llvm/Target/TargetFrameLowering.h"
20 #include <algorithm>
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "amdgpu-subtarget"
25 
26 #define GET_SUBTARGETINFO_ENUM
27 #define GET_SUBTARGETINFO_TARGET_DESC
28 #define GET_SUBTARGETINFO_CTOR
29 #include "AMDGPUGenSubtargetInfo.inc"
30 
// Defaulted out-of-line rather than in the header — presumably so this TU
// serves as the anchor for the class's virtual table/typeinfo emission.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
32 
// Finish subtarget setup that depends on the parsed feature string: build the
// full feature string (defaults + OS-specific features + user features), parse
// it, then patch up flags whose defaults depend on the selected generation.
// Returns *this so it can be used in member-initializer lists.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults go first so that the user-supplied FS (appended below) can
  // override them.
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
74 
// Construct the common AMDGPU subtarget. All feature flags start out false/0
// here; the real values come from ParseSubtargetFeatures, invoked via
// initializeSubtargetDependencies in the constructor body.
// NOTE: member initializers must stay in the class's declaration order
// (AMDGPUSubtarget.h) — the compiler initializes in declaration order.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples start at SOUTHERN_ISLANDS; everything else is R600-class.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Performance characteristics.
    FastFMAF32(false),
    HalfRate64Ops(false),

    // FP / memory-access behavior.
    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    // Optional optimizations / codegen modes.
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    // GCN ISA feature flags.
    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    // R600-family feature flags.
    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}
135 
136 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
137   const Function &F) const {
138   if (NWaves == 1)
139     return getLocalMemorySize();
140   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
141   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
142   unsigned MaxWaves = getMaxWavesPerEU();
143   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
144 }
145 
146 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
147   const Function &F) const {
148   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
149   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
150   unsigned MaxWaves = getMaxWavesPerEU();
151   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
152   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
153   NumWaves = std::min(NumWaves, MaxWaves);
154   NumWaves = std::max(NumWaves, 1u);
155   return NumWaves;
156 }
157 
158 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
159   const Function &F) const {
160   // Default minimum/maximum flat work group sizes.
161   std::pair<unsigned, unsigned> Default =
162     AMDGPU::isCompute(F.getCallingConv()) ?
163       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
164                                     getWavefrontSize() * 4) :
165       std::pair<unsigned, unsigned>(1, getWavefrontSize());
166 
167   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
168   // starts using "amdgpu-flat-work-group-size" attribute.
169   Default.second = AMDGPU::getIntegerAttribute(
170     F, "amdgpu-max-work-group-size", Default.second);
171   Default.first = std::min(Default.first, Default.second);
172 
173   // Requested minimum/maximum flat work group sizes.
174   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
175     F, "amdgpu-flat-work-group-size", Default);
176 
177   // Make sure requested minimum is less than requested maximum.
178   if (Requested.first > Requested.second)
179     return Default;
180 
181   // Make sure requested values do not violate subtarget's specifications.
182   if (Requested.first < getMinFlatWorkGroupSize())
183     return Default;
184   if (Requested.second > getMaxFlatWorkGroupSize())
185     return Default;
186 
187   return Requested;
188 }
189 
// Return the {min, max} number of waves per execution unit for F. A max of 0
// means "no explicit maximum" (see the Requested.second checks below). The
// result reflects the "amdgpu-waves-per-eu" attribute when it is valid and
// compatible with the flat work group size request.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, 0);

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum (a requested
  // maximum of 0 means "unbounded" and is always accepted).
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
237 
// R600-family subtarget: wires up the R600 instruction info, frame lowering
// (stack grows up, no local-area offset), and target lowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
244 
// GCN (SI+) subtarget: wires up the SI instruction info, frame lowering
// (stack grows up, no local-area offset), and target lowering.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
251 
252 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
253                                       unsigned NumRegionInstrs) const {
254   // Track register pressure so the scheduler can try to decrease
255   // pressure once register usage is above the threshold defined by
256   // SIRegisterInfo::getRegPressureSetLimit()
257   Policy.ShouldTrackPressure = true;
258 
259   // Enabling both top down and bottom up scheduling seems to give us less
260   // register spills than just using one of these approaches on its own.
261   Policy.OnlyTopDown = false;
262   Policy.OnlyBottomUp = false;
263 
264   Policy.ShouldTrackLaneMasks = enableSubRegLiveness();
265 }
266 
267 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
268   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
269 }
270 
271 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
272                                             unsigned ExplicitArgBytes) const {
273   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
274   if (ImplicitBytes == 0)
275     return ExplicitArgBytes;
276 
277   unsigned Alignment = getAlignmentForImplicitArgPtr();
278   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
279 }
280 
281 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
282   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
283     if (SGPRs <= 80)
284       return 10;
285     if (SGPRs <= 88)
286       return 9;
287     if (SGPRs <= 100)
288       return 8;
289     return 7;
290   }
291   if (SGPRs <= 48)
292     return 10;
293   if (SGPRs <= 56)
294     return 9;
295   if (SGPRs <= 64)
296     return 8;
297   if (SGPRs <= 72)
298     return 7;
299   if (SGPRs <= 80)
300     return 6;
301   return 5;
302 }
303 
304 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
305   if (VGPRs <= 24)
306     return 10;
307   if (VGPRs <= 28)
308     return 9;
309   if (VGPRs <= 32)
310     return 8;
311   if (VGPRs <= 36)
312     return 7;
313   if (VGPRs <= 40)
314     return 6;
315   if (VGPRs <= 48)
316     return 5;
317   if (VGPRs <= 64)
318     return 4;
319   if (VGPRs <= 84)
320     return 3;
321   if (VGPRs <= 128)
322     return 2;
323   return 1;
324 }
325 
326 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
327   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
328   if (MFI.hasFlatScratchInit()) {
329     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
330       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
331     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
332       return 4; // FLAT_SCRATCH, VCC (in that order).
333   }
334 
335   if (isXNACKEnabled())
336     return 4; // XNACK, VCC (in that order).
337   return 2; // VCC.
338 }
339 
340 unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
341   const Function &F = *MF.getFunction();
342   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
343 
344   // Compute maximum number of SGPRs function can use using default/requested
345   // minimum number of waves per execution unit.
346   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
347   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
348   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
349 
350   // Check if maximum number of SGPRs was explicitly requested using
351   // "amdgpu-num-sgpr" attribute.
352   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
353     unsigned Requested = AMDGPU::getIntegerAttribute(
354       F, "amdgpu-num-sgpr", MaxNumSGPRs);
355 
356     // Make sure requested value does not violate subtarget's specifications.
357     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
358       Requested = 0;
359 
360     // If more SGPRs are required to support the input user/system SGPRs,
361     // increase to accommodate them.
362     //
363     // FIXME: This really ends up using the requested number of SGPRs + number
364     // of reserved special registers in total. Theoretically you could re-use
365     // the last input registers for these special registers, but this would
366     // require a lot of complexity to deal with the weird aliasing.
367     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
368     if (Requested && Requested < InputNumSGPRs)
369       Requested = InputNumSGPRs;
370 
371     // Make sure requested value is compatible with values implied by
372     // default/requested minimum/maximum number of waves per execution unit.
373     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
374       Requested = 0;
375     if (WavesPerEU.second &&
376         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
377       Requested = 0;
378 
379     if (Requested)
380       MaxNumSGPRs = Requested;
381   }
382 
383   if (hasSGPRInitBug())
384     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
385 
386   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
387                   MaxAddressableNumSGPRs);
388 }
389 
390 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
391   const Function &F = *MF.getFunction();
392   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
393 
394   // Compute maximum number of VGPRs function can use using default/requested
395   // minimum number of waves per execution unit.
396   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
397   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
398 
399   // Check if maximum number of VGPRs was explicitly requested using
400   // "amdgpu-num-vgpr" attribute.
401   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
402     unsigned Requested = AMDGPU::getIntegerAttribute(
403       F, "amdgpu-num-vgpr", MaxNumVGPRs);
404 
405     // Make sure requested value does not violate subtarget's specifications.
406     if (Requested && Requested <= getReservedNumVGPRs(MF))
407       Requested = 0;
408 
409     // Make sure requested value is compatible with values implied by
410     // default/requested minimum/maximum number of waves per execution unit.
411     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
412       Requested = 0;
413     if (WavesPerEU.second &&
414         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
415       Requested = 0;
416 
417     if (Requested)
418       MaxNumVGPRs = Requested;
419   }
420 
421   return MaxNumVGPRs - getReservedNumVGPRs(MF);
422 }
423