1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/SmallString.h"
18 #include "llvm/CodeGen/MachineScheduler.h"
19 #include "llvm/Target/TargetFrameLowering.h"
20 #include <algorithm>
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "amdgpu-subtarget"
25 
26 #define GET_SUBTARGETINFO_ENUM
27 #define GET_SUBTARGETINFO_TARGET_DESC
28 #define GET_SUBTARGETINFO_CTOR
29 #include "AMDGPUGenSubtargetInfo.inc"
30 
// Out-of-line defaulted destructor: anchors the class's vtable in this TU.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
32 
/// Parse the feature string and fix up interdependent subtarget state.
/// Called from the constructor; returns *this so it can be used in a member
/// initializer chain by derived subtargets.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults first; the user string is appended below so it can override them.
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
74 
/// Construct the common AMDGPU subtarget. Every feature flag starts at a
/// conservative default here; the real values are established by
/// initializeSubtargetDependencies(), which parses the feature string.
/// NOTE: initializer order must match member declaration order in the header.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are GCN hardware (SI at minimum); anything else is R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Performance characteristics.
    FastFMAF32(false),
    HalfRate64Ops(false),

    // Floating-point modes.
    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    // Optimization toggles.
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    // ISA/encoding capabilities.
    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    // R600-family specifics.
    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}
140 
141 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
142   const Function &F) const {
143   if (NWaves == 1)
144     return getLocalMemorySize();
145   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
146   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
147   unsigned MaxWaves = getMaxWavesPerEU();
148   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
149 }
150 
151 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
152   const Function &F) const {
153   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
154   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
155   unsigned MaxWaves = getMaxWavesPerEU();
156   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
157   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
158   NumWaves = std::min(NumWaves, MaxWaves);
159   NumWaves = std::max(NumWaves, 1u);
160   return NumWaves;
161 }
162 
163 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
164   const Function &F) const {
165   // Default minimum/maximum flat work group sizes.
166   std::pair<unsigned, unsigned> Default =
167     AMDGPU::isCompute(F.getCallingConv()) ?
168       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
169                                     getWavefrontSize() * 4) :
170       std::pair<unsigned, unsigned>(1, getWavefrontSize());
171 
172   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
173   // starts using "amdgpu-flat-work-group-size" attribute.
174   Default.second = AMDGPU::getIntegerAttribute(
175     F, "amdgpu-max-work-group-size", Default.second);
176   Default.first = std::min(Default.first, Default.second);
177 
178   // Requested minimum/maximum flat work group sizes.
179   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
180     F, "amdgpu-flat-work-group-size", Default);
181 
182   // Make sure requested minimum is less than requested maximum.
183   if (Requested.first > Requested.second)
184     return Default;
185 
186   // Make sure requested values do not violate subtarget's specifications.
187   if (Requested.first < getMinFlatWorkGroupSize())
188     return Default;
189   if (Requested.second > getMaxFlatWorkGroupSize())
190     return Default;
191 
192   return Requested;
193 }
194 
/// Return the [minimum, maximum] number of waves per execution unit for \p F,
/// honoring the "amdgpu-waves-per-eu" attribute and keeping the result
/// consistent with the function's flat work group size constraints.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  // (OnlyFirstRequired = true: a lone minimum is accepted, with max left 0.)
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (A zero maximum means "unspecified" and is not checked here.)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
242 
// R600-family subtarget: wires up the R600 instruction info, an upward-growing
// stack frame lowering, and R600 ISel lowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
249 
// GCN (SI+) subtarget: wires up the SI instruction info, an upward-growing
// stack frame lowering, and SI ISel lowering.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
256 
257 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
258                                       unsigned NumRegionInstrs) const {
259   // Track register pressure so the scheduler can try to decrease
260   // pressure once register usage is above the threshold defined by
261   // SIRegisterInfo::getRegPressureSetLimit()
262   Policy.ShouldTrackPressure = true;
263 
264   // Enabling both top down and bottom up scheduling seems to give us less
265   // register spills than just using one of these approaches on its own.
266   Policy.OnlyTopDown = false;
267   Policy.OnlyBottomUp = false;
268 
269   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
270   if (!enableSIScheduler())
271     Policy.ShouldTrackLaneMasks = true;
272 }
273 
274 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
275   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
276 }
277 
278 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
279                                             unsigned ExplicitArgBytes) const {
280   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
281   if (ImplicitBytes == 0)
282     return ExplicitArgBytes;
283 
284   unsigned Alignment = getAlignmentForImplicitArgPtr();
285   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
286 }
287 
288 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
289   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
290     if (SGPRs <= 80)
291       return 10;
292     if (SGPRs <= 88)
293       return 9;
294     if (SGPRs <= 100)
295       return 8;
296     return 7;
297   }
298   if (SGPRs <= 48)
299     return 10;
300   if (SGPRs <= 56)
301     return 9;
302   if (SGPRs <= 64)
303     return 8;
304   if (SGPRs <= 72)
305     return 7;
306   if (SGPRs <= 80)
307     return 6;
308   return 5;
309 }
310 
311 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
312   if (VGPRs <= 24)
313     return 10;
314   if (VGPRs <= 28)
315     return 9;
316   if (VGPRs <= 32)
317     return 8;
318   if (VGPRs <= 36)
319     return 7;
320   if (VGPRs <= 40)
321     return 6;
322   if (VGPRs <= 48)
323     return 5;
324   if (VGPRs <= 64)
325     return 4;
326   if (VGPRs <= 84)
327     return 3;
328   if (VGPRs <= 128)
329     return 2;
330   return 1;
331 }
332 
333 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
334   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
335   if (MFI.hasFlatScratchInit()) {
336     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
337       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
338     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
339       return 4; // FLAT_SCRATCH, VCC (in that order).
340   }
341 
342   if (isXNACKEnabled())
343     return 4; // XNACK, VCC (in that order).
344   return 2; // VCC.
345 }
346 
/// Return the maximum number of allocatable SGPRs for \p MF, honoring the
/// "amdgpu-num-sgpr" attribute, the waves-per-EU constraints, preloaded input
/// SGPRs, and the SGPR-init hardware bug, minus the reserved SGPRs.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  // Throughout this chain, Requested == 0 means "request rejected; keep the
  // computed default".
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // A request at or below the reserved count leaves nothing allocatable.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    // (WavesPerEU.second == 0 means no explicit maximum was requested.)
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware bug workaround: affected parts must use a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
396 
/// Return the maximum number of allocatable VGPRs for \p MF, honoring the
/// "amdgpu-num-vgpr" attribute and the waves-per-EU constraints, minus the
/// reserved VGPRs. Mirrors the SGPR logic in getMaxNumSGPRs().
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  // Throughout this chain, Requested == 0 means "request rejected; keep the
  // computed default".
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // A request at or below the reserved count leaves nothing allocatable.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    // (WavesPerEU.second == 0 means no explicit maximum was requested.)
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
430