1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "llvm/ADT/SmallString.h"
17 #include "llvm/CodeGen/MachineScheduler.h"
18 #include "llvm/Target/TargetFrameLowering.h"
19 #include <algorithm>
20 
21 using namespace llvm;
22 
23 #define DEBUG_TYPE "amdgpu-subtarget"
24 
25 #define GET_SUBTARGETINFO_ENUM
26 #define GET_SUBTARGETINFO_TARGET_DESC
27 #define GET_SUBTARGETINFO_CTOR
28 #include "AMDGPUGenSubtargetInfo.inc"
29 
// Defaulted out of line — presumably so the class's key function/vtable is
// anchored in this TU and the header does not need complete member types;
// confirm against the header if this is ever moved.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
31 
/// Build the full feature string (subtarget defaults + user-provided \p FS),
/// parse it, and then fix up feature bits that the .td defaults cannot
/// express. Returns *this so the constructor can chain it into other
/// initializations.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";

  // User features are appended last so an explicit "-feature" in FS can
  // override any of the defaults added above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
66 
/// Construct the subtarget. The member-initializer list only establishes
/// conservative zero/false defaults (its order must mirror the declaration
/// order in the header); the real values come from the feature string via
/// initializeSubtargetDependencies() at the end.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples start at SOUTHERN_ISLANDS; everything else is R600-class.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    // R600-family-only properties.
    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}
127 
128 // FIXME: These limits are for SI. Did they change with the larger maximum LDS
129 // size?
130 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
131   switch (NWaves) {
132   case 10:
133     return 1638;
134   case 9:
135     return 1820;
136   case 8:
137     return 2048;
138   case 7:
139     return 2340;
140   case 6:
141     return 2730;
142   case 5:
143     return 3276;
144   case 4:
145     return 4096;
146   case 3:
147     return 5461;
148   case 2:
149     return 8192;
150   default:
151     return getLocalMemorySize();
152   }
153 }
154 
155 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
156   if (Bytes <= 1638)
157     return 10;
158 
159   if (Bytes <= 1820)
160     return 9;
161 
162   if (Bytes <= 2048)
163     return 8;
164 
165   if (Bytes <= 2340)
166     return 7;
167 
168   if (Bytes <= 2730)
169     return 6;
170 
171   if (Bytes <= 3276)
172     return 5;
173 
174   if (Bytes <= 4096)
175     return 4;
176 
177   if (Bytes <= 5461)
178     return 3;
179 
180   if (Bytes <= 8192)
181     return 2;
182 
183   return 1;
184 }
185 
186 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
187   const Function &F) const {
188   // Default minimum/maximum flat work group sizes.
189   std::pair<unsigned, unsigned> Default =
190     AMDGPU::isCompute(F.getCallingConv()) ?
191       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
192                                     getWavefrontSize() * 4) :
193       std::pair<unsigned, unsigned>(1, getWavefrontSize());
194 
195   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
196   // starts using "amdgpu-flat-work-group-size" attribute.
197   Default.second = AMDGPU::getIntegerAttribute(
198     F, "amdgpu-max-work-group-size", Default.second);
199   Default.first = std::min(Default.first, Default.second);
200 
201   // Requested minimum/maximum flat work group sizes.
202   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
203     F, "amdgpu-flat-work-group-size", Default);
204 
205   // Make sure requested minimum is less than requested maximum.
206   if (Requested.first > Requested.second)
207     return Default;
208 
209   // Make sure requested values do not violate subtarget's specifications.
210   if (Requested.first < getMinFlatWorkGroupSize())
211     return Default;
212   if (Requested.second > getMaxFlatWorkGroupSize())
213     return Default;
214 
215   return Requested;
216 }
217 
/// Compute the effective minimum/maximum waves per execution unit for \p F
/// from the "amdgpu-waves-per-eu" attribute, constrained by the flat work
/// group sizes returned by getFlatWorkGroupSizes().
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  // A maximum of 0 means "no explicit maximum requested".
  std::pair<unsigned, unsigned> Default(1, 0);

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (A requested maximum of 0 is treated as "unbounded" and skipped here.)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
265 
/// R600-family subtarget: wires up the R600 instruction info, an upward
/// growing stack frame lowering, and the R600 target lowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
272 
/// GCN (SI+) subtarget: wires up the SI instruction info, an upward growing
/// stack frame lowering, and the SI target lowering.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
279 
280 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
281                                       unsigned NumRegionInstrs) const {
282   // Track register pressure so the scheduler can try to decrease
283   // pressure once register usage is above the threshold defined by
284   // SIRegisterInfo::getRegPressureSetLimit()
285   Policy.ShouldTrackPressure = true;
286 
287   // Enabling both top down and bottom up scheduling seems to give us less
288   // register spills than just using one of these approaches on its own.
289   Policy.OnlyTopDown = false;
290   Policy.OnlyBottomUp = false;
291 
292   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
293   if (!enableSIScheduler())
294     Policy.ShouldTrackLaneMasks = true;
295 }
296 
297 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
298   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
299 }
300 
301 unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
302   unsigned ImplicitBytes = getImplicitArgNumBytes();
303   if (ImplicitBytes == 0)
304     return ExplicitArgBytes;
305 
306   unsigned Alignment = getAlignmentForImplicitArgPtr();
307   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
308 }
309 
310 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
311   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
312     if (SGPRs <= 80)
313       return 10;
314     if (SGPRs <= 88)
315       return 9;
316     if (SGPRs <= 100)
317       return 8;
318     return 7;
319   }
320   if (SGPRs <= 48)
321     return 10;
322   if (SGPRs <= 56)
323     return 9;
324   if (SGPRs <= 64)
325     return 8;
326   if (SGPRs <= 72)
327     return 7;
328   if (SGPRs <= 80)
329     return 6;
330   return 5;
331 }
332 
333 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
334   if (VGPRs <= 24)
335     return 10;
336   if (VGPRs <= 28)
337     return 9;
338   if (VGPRs <= 32)
339     return 8;
340   if (VGPRs <= 36)
341     return 7;
342   if (VGPRs <= 40)
343     return 6;
344   if (VGPRs <= 48)
345     return 5;
346   if (VGPRs <= 64)
347     return 4;
348   if (VGPRs <= 84)
349     return 3;
350   if (VGPRs <= 128)
351     return 2;
352   return 1;
353 }
354 
355 unsigned SISubtarget::getMaxNumSGPRs() const {
356   if (hasSGPRInitBug())
357     return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
358 
359   if (getGeneration() >= VOLCANIC_ISLANDS)
360     return 102;
361 
362   return 104;
363 }
364