1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "llvm/ADT/SmallString.h"
17 #include "llvm/CodeGen/MachineScheduler.h"
18 #include "llvm/Target/TargetFrameLowering.h"
19 #include <algorithm>
20 
21 using namespace llvm;
22 
23 #define DEBUG_TYPE "amdgpu-subtarget"
24 
25 #define GET_SUBTARGETINFO_ENUM
26 #define GET_SUBTARGETINFO_TARGET_DESC
27 #define GET_SUBTARGETINFO_CTOR
28 #include "AMDGPUGenSubtargetInfo.inc"
29 
// Defaulted destructor, defined out of line in this translation unit.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
31 
32 AMDGPUSubtarget &
33 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
34                                                  StringRef GPU, StringRef FS) {
35   // Determine default and user-specified characteristics
36   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
37   // enabled, but some instructions do not respect them and they run at the
38   // double precision rate, so don't enable by default.
39   //
40   // We want to be able to turn these off, but making this a subtarget feature
41   // for SI has the unhelpful behavior that it unsets everything else if you
42   // disable it.
43 
44   SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
45   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
46     FullFS += "+flat-for-global,+unaligned-buffer-access,";
47 
48   FullFS += FS;
49 
50   ParseSubtargetFeatures(GPU, FullFS);
51 
52   // FIXME: I don't think think Evergreen has any useful support for
53   // denormals, but should be checked. Should we issue a warning somewhere
54   // if someone tries to enable these?
55   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
56     FP64FP16Denormals = false;
57     FP32Denormals = false;
58   }
59 
60   // Set defaults if needed.
61   if (MaxPrivateElementSize == 0)
62     MaxPrivateElementSize = 4;
63 
64   return *this;
65 }
66 
// Constructor: every feature flag starts at its "off"/zero default here; the
// real values are then derived from the GPU name and feature string by
// initializeSubtargetDependencies() at the end of the body.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // An amdgcn triple is GCN (Southern Islands at minimum); otherwise R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Performance characteristics.
    FastFMAF32(false),
    HalfRate64Ops(false),

    // Floating-point modes and memory-access capabilities.
    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    NoAddr64(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    // XNACK and debugger support.
    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    // Optional optimizations / codegen features.
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    // Instruction-set capability flags (filled in from the feature string).
    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    // R600-family specific state.
    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}
128 
129 // FIXME: These limits are for SI. Did they change with the larger maximum LDS
130 // size?
131 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
132   switch (NWaves) {
133   case 10:
134     return 1638;
135   case 9:
136     return 1820;
137   case 8:
138     return 2048;
139   case 7:
140     return 2340;
141   case 6:
142     return 2730;
143   case 5:
144     return 3276;
145   case 4:
146     return 4096;
147   case 3:
148     return 5461;
149   case 2:
150     return 8192;
151   default:
152     return getLocalMemorySize();
153   }
154 }
155 
156 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
157   if (Bytes <= 1638)
158     return 10;
159 
160   if (Bytes <= 1820)
161     return 9;
162 
163   if (Bytes <= 2048)
164     return 8;
165 
166   if (Bytes <= 2340)
167     return 7;
168 
169   if (Bytes <= 2730)
170     return 6;
171 
172   if (Bytes <= 3276)
173     return 5;
174 
175   if (Bytes <= 4096)
176     return 4;
177 
178   if (Bytes <= 5461)
179     return 3;
180 
181   if (Bytes <= 8192)
182     return 2;
183 
184   return 1;
185 }
186 
187 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
188   const Function &F) const {
189   // Default minimum/maximum flat work group sizes.
190   std::pair<unsigned, unsigned> Default =
191     AMDGPU::isCompute(F.getCallingConv()) ?
192       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
193                                     getWavefrontSize() * 4) :
194       std::pair<unsigned, unsigned>(1, getWavefrontSize());
195 
196   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
197   // starts using "amdgpu-flat-work-group-size" attribute.
198   Default.second = AMDGPU::getIntegerAttribute(
199     F, "amdgpu-max-work-group-size", Default.second);
200   Default.first = std::min(Default.first, Default.second);
201 
202   // Requested minimum/maximum flat work group sizes.
203   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
204     F, "amdgpu-flat-work-group-size", Default);
205 
206   // Make sure requested minimum is less than requested maximum.
207   if (Requested.first > Requested.second)
208     return Default;
209 
210   // Make sure requested values do not violate subtarget's specifications.
211   if (Requested.first < getMinFlatWorkGroupSize())
212     return Default;
213   if (Requested.second > getMaxFlatWorkGroupSize())
214     return Default;
215 
216   return Requested;
217 }
218 
219 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
220   const Function &F) const {
221   // Default minimum/maximum number of waves per execution unit.
222   std::pair<unsigned, unsigned> Default(1, 0);
223 
224   // Default/requested minimum/maximum flat work group sizes.
225   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
226 
227   // If minimum/maximum flat work group sizes were explicitly requested using
228   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
229   // number of waves per execution unit to values implied by requested
230   // minimum/maximum flat work group sizes.
231   unsigned MinImpliedByFlatWorkGroupSize =
232     getMaxWavesPerEU(FlatWorkGroupSizes.second);
233   bool RequestedFlatWorkGroupSize = false;
234 
235   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
236   // starts using "amdgpu-flat-work-group-size" attribute.
237   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
238       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
239     Default.first = MinImpliedByFlatWorkGroupSize;
240     RequestedFlatWorkGroupSize = true;
241   }
242 
243   // Requested minimum/maximum number of waves per execution unit.
244   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
245     F, "amdgpu-waves-per-eu", Default, true);
246 
247   // Make sure requested minimum is less than requested maximum.
248   if (Requested.second && Requested.first > Requested.second)
249     return Default;
250 
251   // Make sure requested values do not violate subtarget's specifications.
252   if (Requested.first < getMinWavesPerEU() ||
253       Requested.first > getMaxWavesPerEU())
254     return Default;
255   if (Requested.second > getMaxWavesPerEU())
256     return Default;
257 
258   // Make sure requested values are compatible with values implied by requested
259   // minimum/maximum flat work group sizes.
260   if (RequestedFlatWorkGroupSize &&
261       Requested.first > MinImpliedByFlatWorkGroupSize)
262     return Default;
263 
264   return Requested;
265 }
266 
// R600 (pre-GCN) subtarget: wires up the R600 instruction info, frame
// lowering (private stack grows upward, no local-area offset), and target
// lowering objects for this subtarget.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
273 
// GCN (SI+) subtarget: wires up the SI instruction info, frame lowering
// (private stack grows upward, no local-area offset), and target lowering
// objects for this subtarget.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
280 
281 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
282                                       unsigned NumRegionInstrs) const {
283   // Track register pressure so the scheduler can try to decrease
284   // pressure once register usage is above the threshold defined by
285   // SIRegisterInfo::getRegPressureSetLimit()
286   Policy.ShouldTrackPressure = true;
287 
288   // Enabling both top down and bottom up scheduling seems to give us less
289   // register spills than just using one of these approaches on its own.
290   Policy.OnlyTopDown = false;
291   Policy.OnlyBottomUp = false;
292 
293   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
294   if (!enableSIScheduler())
295     Policy.ShouldTrackLaneMasks = true;
296 }
297 
298 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
299   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
300 }
301 
302 unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
303   unsigned ImplicitBytes = getImplicitArgNumBytes();
304   if (ImplicitBytes == 0)
305     return ExplicitArgBytes;
306 
307   unsigned Alignment = getAlignmentForImplicitArgPtr();
308   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
309 }
310 
311 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
312   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
313     if (SGPRs <= 80)
314       return 10;
315     if (SGPRs <= 88)
316       return 9;
317     if (SGPRs <= 100)
318       return 8;
319     return 7;
320   }
321   if (SGPRs <= 48)
322     return 10;
323   if (SGPRs <= 56)
324     return 9;
325   if (SGPRs <= 64)
326     return 8;
327   if (SGPRs <= 72)
328     return 7;
329   if (SGPRs <= 80)
330     return 6;
331   return 5;
332 }
333 
334 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
335   if (VGPRs <= 24)
336     return 10;
337   if (VGPRs <= 28)
338     return 9;
339   if (VGPRs <= 32)
340     return 8;
341   if (VGPRs <= 36)
342     return 7;
343   if (VGPRs <= 40)
344     return 6;
345   if (VGPRs <= 48)
346     return 5;
347   if (VGPRs <= 64)
348     return 4;
349   if (VGPRs <= 84)
350     return 3;
351   if (VGPRs <= 128)
352     return 2;
353   return 1;
354 }
355 
356 unsigned SISubtarget::getMaxNumSGPRs() const {
357   if (hasSGPRInitBug())
358     return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
359 
360   if (getGeneration() >= VOLCANIC_ISLANDS)
361     return 102;
362 
363   return 104;
364 }
365