1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "R600ISelLowering.h"
17 #include "R600InstrInfo.h"
18 #include "SIFrameLowering.h"
19 #include "SIISelLowering.h"
20 #include "SIInstrInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/ADT/SmallString.h"
23 #include "llvm/CodeGen/MachineScheduler.h"
24 
25 using namespace llvm;
26 
27 #define DEBUG_TYPE "amdgpu-subtarget"
28 
29 #define GET_SUBTARGETINFO_ENUM
30 #define GET_SUBTARGETINFO_TARGET_DESC
31 #define GET_SUBTARGETINFO_CTOR
32 #include "AMDGPUGenSubtargetInfo.inc"
33 
34 AMDGPUSubtarget::~AMDGPUSubtarget() {}
35 
36 AMDGPUSubtarget &
37 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
38                                                  StringRef GPU, StringRef FS) {
39   // Determine default and user-specified characteristics
40   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
41   // enabled, but some instructions do not respect them and they run at the
42   // double precision rate, so don't enable by default.
43   //
44   // We want to be able to turn these off, but making this a subtarget feature
45   // for SI has the unhelpful behavior that it unsets everything else if you
46   // disable it.
47 
48   SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
49   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
50     FullFS += "+flat-for-global,+unaligned-buffer-access,";
51   FullFS += FS;
52 
53   ParseSubtargetFeatures(GPU, FullFS);
54 
55   // FIXME: I don't think think Evergreen has any useful support for
56   // denormals, but should be checked. Should we issue a warning somewhere
57   // if someone tries to enable these?
58   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
59     FP16Denormals = false;
60     FP32Denormals = false;
61     FP64Denormals = false;
62   }
63 
64   // Set defaults if needed.
65   if (MaxPrivateElementSize == 0)
66     MaxPrivateElementSize = 4;
67 
68   return *this;
69 }
70 
/// Construct the state shared by all AMDGPU subtargets.
///
/// The initializer list gives every size field and feature flag a baseline
/// value (0 / false, apart from the few noted below). The constructor body
/// then calls initializeSubtargetDependencies(), which parses \p GPU and
/// \p FS and applies the fix-ups implemented there.
///
/// \param TT  Target triple; stored in TargetTriple and used to pick the
///            initial generation.
/// \param GPU Processor name; also used to look up the instruction itinerary.
/// \param FS  User feature string.
/// \param TM  Target machine (not referenced in this constructor).
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // Initial generation depends only on the architecture in the triple.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    // Zero here is replaced by 4 in initializeSubtargetDependencies() if it
    // is still zero after feature parsing.
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP16Denormals(false),
    FP32Denormals(false),
    FP64Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)),
    TSInfo() {
  // Parse the GPU name and feature string now that all members have a
  // baseline value.
  initializeSubtargetDependencies(TT, GPU, FS);
}
131 
132 // FIXME: These limits are for SI. Did they change with the larger maximum LDS
133 // size?
134 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
135   switch (NWaves) {
136   case 10:
137     return 1638;
138   case 9:
139     return 1820;
140   case 8:
141     return 2048;
142   case 7:
143     return 2340;
144   case 6:
145     return 2730;
146   case 5:
147     return 3276;
148   case 4:
149     return 4096;
150   case 3:
151     return 5461;
152   case 2:
153     return 8192;
154   default:
155     return getLocalMemorySize();
156   }
157 }
158 
159 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
160   if (Bytes <= 1638)
161     return 10;
162 
163   if (Bytes <= 1820)
164     return 9;
165 
166   if (Bytes <= 2048)
167     return 8;
168 
169   if (Bytes <= 2340)
170     return 7;
171 
172   if (Bytes <= 2730)
173     return 6;
174 
175   if (Bytes <= 3276)
176     return 5;
177 
178   if (Bytes <= 4096)
179     return 4;
180 
181   if (Bytes <= 5461)
182     return 3;
183 
184   if (Bytes <= 8192)
185     return 2;
186 
187   return 1;
188 }
189 
190 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
191   const Function &F) const {
192 
193   // Default minimum/maximum flat work group sizes.
194   std::pair<unsigned, unsigned> Default =
195     AMDGPU::isCompute(F.getCallingConv()) ?
196       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
197                                     getWavefrontSize() * 4) :
198       std::pair<unsigned, unsigned>(1, getWavefrontSize());
199 
200   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
201   // starts using "amdgpu-flat-work-group-size" attribute.
202   Default.second = AMDGPU::getIntegerAttribute(
203     F, "amdgpu-max-work-group-size", Default.second);
204   Default.first = std::min(Default.first, Default.second);
205 
206   // Requested minimum/maximum flat work group sizes.
207   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
208     F, "amdgpu-flat-work-group-size", Default);
209 
210   // Make sure requested minimum is less than requested maximum.
211   if (Requested.first > Requested.second)
212     return Default;
213 
214   // Make sure requested values do not violate subtarget's specifications.
215   if (Requested.first < getMinFlatWorkGroupSize())
216     return Default;
217   if (Requested.second > getMaxFlatWorkGroupSize())
218     return Default;
219 
220   return Requested;
221 }
222 
223 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
224   const Function &F) const {
225 
226   // Default minimum/maximum number of waves per execution unit.
227   std::pair<unsigned, unsigned> Default(1, 0);
228 
229   // Default/requested minimum/maximum flat work group sizes.
230   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
231 
232   // If minimum/maximum flat work group sizes were explicitly requested using
233   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
234   // number of waves per execution unit to values implied by requested
235   // minimum/maximum flat work group sizes.
236   unsigned MinImpliedByFlatWorkGroupSize =
237     getMaxWavesPerEU(FlatWorkGroupSizes.second);
238   bool RequestedFlatWorkGroupSize = false;
239 
240   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
241   // starts using "amdgpu-flat-work-group-size" attribute.
242   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
243       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
244     Default.first = MinImpliedByFlatWorkGroupSize;
245     RequestedFlatWorkGroupSize = true;
246   }
247 
248   // Requested minimum/maximum number of waves per execution unit.
249   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
250     F, "amdgpu-waves-per-eu", Default, true);
251 
252   // Make sure requested minimum is less than requested maximum.
253   if (Requested.second && Requested.first > Requested.second)
254     return Default;
255 
256   // Make sure requested values do not violate subtarget's specifications.
257   if (Requested.first < getMinWavesPerEU() ||
258       Requested.first > getMaxWavesPerEU())
259     return Default;
260   if (Requested.second > getMaxWavesPerEU())
261     return Default;
262 
263   // Make sure requested values are compatible with values implied by requested
264   // minimum/maximum flat work group sizes.
265   if (RequestedFlatWorkGroupSize &&
266       Requested.first > MinImpliedByFlatWorkGroupSize)
267     return Default;
268 
269   return Requested;
270 }
271 
/// Construct an R600 subtarget.
///
/// The AMDGPUSubtarget base is initialized first, so the feature parsing done
/// by its constructor has already run when *this is handed to InstrInfo and
/// TLInfo. The frame lowering is created with an upward-growing stack, the
/// alignment reported by getStackAlignment(), and 0 as its third argument
/// (presumably the local area offset -- TODO confirm).
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
278 
/// Construct an SI subtarget.
///
/// The AMDGPUSubtarget base is initialized first, so the feature parsing done
/// by its constructor has already run when *this is handed to InstrInfo and
/// TLInfo. The frame lowering is created with an upward-growing stack, the
/// alignment reported by getStackAlignment(), and 0 as its third argument
/// (presumably the local area offset -- TODO confirm). GISel is
/// value-initialized here.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this),
  GISel() {}
286 
287 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
288                                       unsigned NumRegionInstrs) const {
289   // Track register pressure so the scheduler can try to decrease
290   // pressure once register usage is above the threshold defined by
291   // SIRegisterInfo::getRegPressureSetLimit()
292   Policy.ShouldTrackPressure = true;
293 
294   // Enabling both top down and bottom up scheduling seems to give us less
295   // register spills than just using one of these approaches on its own.
296   Policy.OnlyTopDown = false;
297   Policy.OnlyBottomUp = false;
298 
299   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
300   if (!enableSIScheduler())
301     Policy.ShouldTrackLaneMasks = true;
302 }
303 
304 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
305   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
306 }
307 
308 unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
309   unsigned ImplicitBytes = getImplicitArgNumBytes();
310   if (ImplicitBytes == 0)
311     return ExplicitArgBytes;
312 
313   unsigned Alignment = getAlignmentForImplicitArgPtr();
314   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
315 }
316 
317 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
318   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
319     if (SGPRs <= 80)
320       return 10;
321     if (SGPRs <= 88)
322       return 9;
323     if (SGPRs <= 100)
324       return 8;
325     return 7;
326   }
327   if (SGPRs <= 48)
328     return 10;
329   if (SGPRs <= 56)
330     return 9;
331   if (SGPRs <= 64)
332     return 8;
333   if (SGPRs <= 72)
334     return 7;
335   if (SGPRs <= 80)
336     return 6;
337   return 5;
338 }
339 
340 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
341   if (VGPRs <= 24)
342     return 10;
343   if (VGPRs <= 28)
344     return 9;
345   if (VGPRs <= 32)
346     return 8;
347   if (VGPRs <= 36)
348     return 7;
349   if (VGPRs <= 40)
350     return 6;
351   if (VGPRs <= 48)
352     return 5;
353   if (VGPRs <= 64)
354     return 4;
355   if (VGPRs <= 84)
356     return 3;
357   if (VGPRs <= 128)
358     return 2;
359   return 1;
360 }
361 
362 unsigned SISubtarget::getMaxNumSGPRs() const {
363   if (hasSGPRInitBug())
364     return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
365 
366   if (getGeneration() >= VOLCANIC_ISLANDS)
367     return 102;
368 
369   return 104;
370 }
371