1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "llvm/ADT/SmallString.h"
17 #include "llvm/CodeGen/MachineScheduler.h"
18 #include "llvm/Target/TargetFrameLowering.h"
19 #include <algorithm>
20 
21 using namespace llvm;
22 
23 #define DEBUG_TYPE "amdgpu-subtarget"
24 
25 #define GET_SUBTARGETINFO_ENUM
26 #define GET_SUBTARGETINFO_TARGET_DESC
27 #define GET_SUBTARGETINFO_CTOR
28 #include "AMDGPUGenSubtargetInfo.inc"
29 
// Defaulted out of line: AMDGPUSubtarget is the base class of the
// R600Subtarget and SISubtarget definitions below.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
31 
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // NOTE(review): TT is unused here; the OS check goes through isAmdHsaOS()
  // instead -- confirm whether the parameter is kept for interface reasons.

  // Defaults are prepended so that any explicit setting in FS (appended
  // below) takes precedence.
  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";
  FullFS += FS;

  // TableGen-generated parser; sets the feature members from the string.
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP16Denormals = false;
    FP32Denormals = false;
    FP64Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
66 
/// Construct a subtarget for the given triple/CPU/feature string, then let
/// initializeSubtargetDependencies() parse the features and fix up dependent
/// defaults.
///
/// NOTE(review): the initializer order below must match the member
/// declaration order in AMDGPUSubtarget.h -- confirm against the header
/// before reordering anything here.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples start at SI; everything else is the R600 family.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    // These may be overridden by the parsed feature string and are clamped
    // for pre-SI generations in initializeSubtargetDependencies().
    FP16Denormals(false),
    FP32Denormals(false),
    FP64Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    FlatAddressSpace(false),

    // R600-family specifics.
    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}
126 
127 // FIXME: These limits are for SI. Did they change with the larger maximum LDS
128 // size?
129 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
130   switch (NWaves) {
131   case 10:
132     return 1638;
133   case 9:
134     return 1820;
135   case 8:
136     return 2048;
137   case 7:
138     return 2340;
139   case 6:
140     return 2730;
141   case 5:
142     return 3276;
143   case 4:
144     return 4096;
145   case 3:
146     return 5461;
147   case 2:
148     return 8192;
149   default:
150     return getLocalMemorySize();
151   }
152 }
153 
154 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
155   if (Bytes <= 1638)
156     return 10;
157 
158   if (Bytes <= 1820)
159     return 9;
160 
161   if (Bytes <= 2048)
162     return 8;
163 
164   if (Bytes <= 2340)
165     return 7;
166 
167   if (Bytes <= 2730)
168     return 6;
169 
170   if (Bytes <= 3276)
171     return 5;
172 
173   if (Bytes <= 4096)
174     return 4;
175 
176   if (Bytes <= 5461)
177     return 3;
178 
179   if (Bytes <= 8192)
180     return 2;
181 
182   return 1;
183 }
184 
185 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
186   const Function &F) const {
187   // Default minimum/maximum flat work group sizes.
188   std::pair<unsigned, unsigned> Default =
189     AMDGPU::isCompute(F.getCallingConv()) ?
190       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
191                                     getWavefrontSize() * 4) :
192       std::pair<unsigned, unsigned>(1, getWavefrontSize());
193 
194   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
195   // starts using "amdgpu-flat-work-group-size" attribute.
196   Default.second = AMDGPU::getIntegerAttribute(
197     F, "amdgpu-max-work-group-size", Default.second);
198   Default.first = std::min(Default.first, Default.second);
199 
200   // Requested minimum/maximum flat work group sizes.
201   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
202     F, "amdgpu-flat-work-group-size", Default);
203 
204   // Make sure requested minimum is less than requested maximum.
205   if (Requested.first > Requested.second)
206     return Default;
207 
208   // Make sure requested values do not violate subtarget's specifications.
209   if (Requested.first < getMinFlatWorkGroupSize())
210     return Default;
211   if (Requested.second > getMaxFlatWorkGroupSize())
212     return Default;
213 
214   return Requested;
215 }
216 
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit. A maximum of
  // 0 means "no explicit upper bound".
  std::pair<unsigned, unsigned> Default(1, 0);

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit. Note that
  // Default (possibly updated above) is the fallback when the attribute is
  // absent or malformed.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum (a requested
  // maximum of 0 means "unbounded" and is always acceptable here).
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
264 
/// R600-family subtarget: wires up the R600 instruction info, an
/// upward-growing stack frame, and the R600 ISel lowering for this target
/// machine.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
271 
/// GCN (SI+) subtarget: wires up the SI instruction info, an upward-growing
/// stack frame, and the SI ISel lowering for this target machine.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
278 
279 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
280                                       unsigned NumRegionInstrs) const {
281   // Track register pressure so the scheduler can try to decrease
282   // pressure once register usage is above the threshold defined by
283   // SIRegisterInfo::getRegPressureSetLimit()
284   Policy.ShouldTrackPressure = true;
285 
286   // Enabling both top down and bottom up scheduling seems to give us less
287   // register spills than just using one of these approaches on its own.
288   Policy.OnlyTopDown = false;
289   Policy.OnlyBottomUp = false;
290 
291   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
292   if (!enableSIScheduler())
293     Policy.ShouldTrackLaneMasks = true;
294 }
295 
296 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
297   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
298 }
299 
300 unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
301   unsigned ImplicitBytes = getImplicitArgNumBytes();
302   if (ImplicitBytes == 0)
303     return ExplicitArgBytes;
304 
305   unsigned Alignment = getAlignmentForImplicitArgPtr();
306   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
307 }
308 
309 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
310   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
311     if (SGPRs <= 80)
312       return 10;
313     if (SGPRs <= 88)
314       return 9;
315     if (SGPRs <= 100)
316       return 8;
317     return 7;
318   }
319   if (SGPRs <= 48)
320     return 10;
321   if (SGPRs <= 56)
322     return 9;
323   if (SGPRs <= 64)
324     return 8;
325   if (SGPRs <= 72)
326     return 7;
327   if (SGPRs <= 80)
328     return 6;
329   return 5;
330 }
331 
332 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
333   if (VGPRs <= 24)
334     return 10;
335   if (VGPRs <= 28)
336     return 9;
337   if (VGPRs <= 32)
338     return 8;
339   if (VGPRs <= 36)
340     return 7;
341   if (VGPRs <= 40)
342     return 6;
343   if (VGPRs <= 48)
344     return 5;
345   if (VGPRs <= 64)
346     return 4;
347   if (VGPRs <= 84)
348     return 3;
349   if (VGPRs <= 128)
350     return 2;
351   return 1;
352 }
353 
354 unsigned SISubtarget::getMaxNumSGPRs() const {
355   if (hasSGPRInitBug())
356     return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
357 
358   if (getGeneration() >= VOLCANIC_ISLANDS)
359     return 102;
360 
361   return 104;
362 }
363