1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/SmallString.h"
18 #include "llvm/CodeGen/MachineScheduler.h"
19 #include "llvm/Target/TargetFrameLowering.h"
20 #include <algorithm>
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "amdgpu-subtarget"
25 
26 #define GET_SUBTARGETINFO_ENUM
27 #define GET_SUBTARGETINFO_TARGET_DESC
28 #define GET_SUBTARGETINFO_CTOR
29 #include "AMDGPUGenSubtargetInfo.inc"
30 
// Compiler-generated destructor, defined out of line in this translation unit.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
32 
33 AMDGPUSubtarget &
34 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
35                                                  StringRef GPU, StringRef FS) {
36   // Determine default and user-specified characteristics
37   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
38   // enabled, but some instructions do not respect them and they run at the
39   // double precision rate, so don't enable by default.
40   //
41   // We want to be able to turn these off, but making this a subtarget feature
42   // for SI has the unhelpful behavior that it unsets everything else if you
43   // disable it.
44 
45   SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
46   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
47     FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
48 
49   FullFS += FS;
50 
51   ParseSubtargetFeatures(GPU, FullFS);
52 
53   // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
54   // on VI and newer hardware to avoid assertion failures due to missing ADDR64
55   // variants of MUBUF instructions.
56   if (!hasAddr64() && !FS.contains("flat-for-global")) {
57     FlatForGlobal = true;
58   }
59 
60   // FIXME: I don't think think Evergreen has any useful support for
61   // denormals, but should be checked. Should we issue a warning somewhere
62   // if someone tries to enable these?
63   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
64     FP64FP16Denormals = false;
65     FP32Denormals = false;
66   }
67 
68   // Set defaults if needed.
69   if (MaxPrivateElementSize == 0)
70     MaxPrivateElementSize = 4;
71 
72   return *this;
73 }
74 
/// Constructs the state shared by all AMDGPU subtargets.
///
/// Every boolean feature flag starts out false and the numeric members start
/// at 0, except WavefrontSize which starts at 64. The constructor body then
/// calls initializeSubtargetDependencies(), which parses the feature string
/// and applies its post-parse adjustments.
///
/// \param TT  Target triple; picks the initial generation and is forwarded to
///            the generated base class and to AMDGPU::getAMDGPUAS().
/// \param GPU Processor name; also used to look up the instruction itinerary.
/// \param FS  Feature string; forwarded to the base class and to
///            initializeSubtargetDependencies().
/// \param TM  Target machine (not referenced in this constructor).
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // Initial generation is chosen from the architecture alone.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    // 0 is replaced by 4 in initializeSubtargetDependencies() if still unset.
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  // AS is derived from the triple before the feature string is parsed.
  AS = AMDGPU::getAMDGPUAS(TT);
  // Parse the feature string and apply the post-parse adjustments.
  initializeSubtargetDependencies(TT, GPU, FS);
}
141 
142 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
143   const Function &F) const {
144   if (NWaves == 1)
145     return getLocalMemorySize();
146   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
147   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
148   unsigned MaxWaves = getMaxWavesPerEU();
149   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
150 }
151 
152 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
153   const Function &F) const {
154   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
155   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
156   unsigned MaxWaves = getMaxWavesPerEU();
157   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
158   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
159   NumWaves = std::min(NumWaves, MaxWaves);
160   NumWaves = std::max(NumWaves, 1u);
161   return NumWaves;
162 }
163 
164 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
165   const Function &F) const {
166   // Default minimum/maximum flat work group sizes.
167   std::pair<unsigned, unsigned> Default =
168     AMDGPU::isCompute(F.getCallingConv()) ?
169       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
170                                     getWavefrontSize() * 4) :
171       std::pair<unsigned, unsigned>(1, getWavefrontSize());
172 
173   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
174   // starts using "amdgpu-flat-work-group-size" attribute.
175   Default.second = AMDGPU::getIntegerAttribute(
176     F, "amdgpu-max-work-group-size", Default.second);
177   Default.first = std::min(Default.first, Default.second);
178 
179   // Requested minimum/maximum flat work group sizes.
180   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
181     F, "amdgpu-flat-work-group-size", Default);
182 
183   // Make sure requested minimum is less than requested maximum.
184   if (Requested.first > Requested.second)
185     return Default;
186 
187   // Make sure requested values do not violate subtarget's specifications.
188   if (Requested.first < getMinFlatWorkGroupSize())
189     return Default;
190   if (Requested.second > getMaxFlatWorkGroupSize())
191     return Default;
192 
193   return Requested;
194 }
195 
196 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
197   const Function &F) const {
198   // Default minimum/maximum number of waves per execution unit.
199   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
200 
201   // Default/requested minimum/maximum flat work group sizes.
202   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
203 
204   // If minimum/maximum flat work group sizes were explicitly requested using
205   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
206   // number of waves per execution unit to values implied by requested
207   // minimum/maximum flat work group sizes.
208   unsigned MinImpliedByFlatWorkGroupSize =
209     getMaxWavesPerEU(FlatWorkGroupSizes.second);
210   bool RequestedFlatWorkGroupSize = false;
211 
212   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
213   // starts using "amdgpu-flat-work-group-size" attribute.
214   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
215       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
216     Default.first = MinImpliedByFlatWorkGroupSize;
217     RequestedFlatWorkGroupSize = true;
218   }
219 
220   // Requested minimum/maximum number of waves per execution unit.
221   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
222     F, "amdgpu-waves-per-eu", Default, true);
223 
224   // Make sure requested minimum is less than requested maximum.
225   if (Requested.second && Requested.first > Requested.second)
226     return Default;
227 
228   // Make sure requested values do not violate subtarget's specifications.
229   if (Requested.first < getMinWavesPerEU() ||
230       Requested.first > getMaxWavesPerEU())
231     return Default;
232   if (Requested.second > getMaxWavesPerEU())
233     return Default;
234 
235   // Make sure requested values are compatible with values implied by requested
236   // minimum/maximum flat work group sizes.
237   if (RequestedFlatWorkGroupSize &&
238       Requested.first > MinImpliedByFlatWorkGroupSize)
239     return Default;
240 
241   return Requested;
242 }
243 
244 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
245                              const TargetMachine &TM) :
246   AMDGPUSubtarget(TT, GPU, FS, TM),
247   InstrInfo(*this),
248   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
249   TLInfo(TM, *this) {}
250 
251 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
252                          const TargetMachine &TM) :
253   AMDGPUSubtarget(TT, GPU, FS, TM),
254   InstrInfo(*this),
255   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
256   TLInfo(TM, *this) {}
257 
258 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
259                                       unsigned NumRegionInstrs) const {
260   // Track register pressure so the scheduler can try to decrease
261   // pressure once register usage is above the threshold defined by
262   // SIRegisterInfo::getRegPressureSetLimit()
263   Policy.ShouldTrackPressure = true;
264 
265   // Enabling both top down and bottom up scheduling seems to give us less
266   // register spills than just using one of these approaches on its own.
267   Policy.OnlyTopDown = false;
268   Policy.OnlyBottomUp = false;
269 
270   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
271   if (!enableSIScheduler())
272     Policy.ShouldTrackLaneMasks = true;
273 }
274 
275 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
276   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
277 }
278 
279 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
280                                             unsigned ExplicitArgBytes) const {
281   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
282   if (ImplicitBytes == 0)
283     return ExplicitArgBytes;
284 
285   unsigned Alignment = getAlignmentForImplicitArgPtr();
286   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
287 }
288 
289 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
290   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
291     if (SGPRs <= 80)
292       return 10;
293     if (SGPRs <= 88)
294       return 9;
295     if (SGPRs <= 100)
296       return 8;
297     return 7;
298   }
299   if (SGPRs <= 48)
300     return 10;
301   if (SGPRs <= 56)
302     return 9;
303   if (SGPRs <= 64)
304     return 8;
305   if (SGPRs <= 72)
306     return 7;
307   if (SGPRs <= 80)
308     return 6;
309   return 5;
310 }
311 
312 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
313   if (VGPRs <= 24)
314     return 10;
315   if (VGPRs <= 28)
316     return 9;
317   if (VGPRs <= 32)
318     return 8;
319   if (VGPRs <= 36)
320     return 7;
321   if (VGPRs <= 40)
322     return 6;
323   if (VGPRs <= 48)
324     return 5;
325   if (VGPRs <= 64)
326     return 4;
327   if (VGPRs <= 84)
328     return 3;
329   if (VGPRs <= 128)
330     return 2;
331   return 1;
332 }
333 
334 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
335   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
336   if (MFI.hasFlatScratchInit()) {
337     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
338       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
339     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
340       return 4; // FLAT_SCRATCH, VCC (in that order).
341   }
342 
343   if (isXNACKEnabled())
344     return 4; // XNACK, VCC (in that order).
345   return 2; // VCC.
346 }
347 
348 unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
349   const Function &F = *MF.getFunction();
350   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
351 
352   // Compute maximum number of SGPRs function can use using default/requested
353   // minimum number of waves per execution unit.
354   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
355   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
356   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
357 
358   // Check if maximum number of SGPRs was explicitly requested using
359   // "amdgpu-num-sgpr" attribute.
360   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
361     unsigned Requested = AMDGPU::getIntegerAttribute(
362       F, "amdgpu-num-sgpr", MaxNumSGPRs);
363 
364     // Make sure requested value does not violate subtarget's specifications.
365     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
366       Requested = 0;
367 
368     // If more SGPRs are required to support the input user/system SGPRs,
369     // increase to accommodate them.
370     //
371     // FIXME: This really ends up using the requested number of SGPRs + number
372     // of reserved special registers in total. Theoretically you could re-use
373     // the last input registers for these special registers, but this would
374     // require a lot of complexity to deal with the weird aliasing.
375     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
376     if (Requested && Requested < InputNumSGPRs)
377       Requested = InputNumSGPRs;
378 
379     // Make sure requested value is compatible with values implied by
380     // default/requested minimum/maximum number of waves per execution unit.
381     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
382       Requested = 0;
383     if (WavesPerEU.second &&
384         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
385       Requested = 0;
386 
387     if (Requested)
388       MaxNumSGPRs = Requested;
389   }
390 
391   if (hasSGPRInitBug())
392     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
393 
394   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
395                   MaxAddressableNumSGPRs);
396 }
397 
398 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
399   const Function &F = *MF.getFunction();
400   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
401 
402   // Compute maximum number of VGPRs function can use using default/requested
403   // minimum number of waves per execution unit.
404   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
405   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
406 
407   // Check if maximum number of VGPRs was explicitly requested using
408   // "amdgpu-num-vgpr" attribute.
409   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
410     unsigned Requested = AMDGPU::getIntegerAttribute(
411       F, "amdgpu-num-vgpr", MaxNumVGPRs);
412 
413     // Make sure requested value does not violate subtarget's specifications.
414     if (Requested && Requested <= getReservedNumVGPRs(MF))
415       Requested = 0;
416 
417     // Make sure requested value is compatible with values implied by
418     // default/requested minimum/maximum number of waves per execution unit.
419     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
420       Requested = 0;
421     if (WavesPerEU.second &&
422         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
423       Requested = 0;
424 
425     if (Requested)
426       MaxNumVGPRs = Requested;
427   }
428 
429   return MaxNumVGPRs - getReservedNumVGPRs(MF);
430 }
431