1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "R600ISelLowering.h"
17 #include "R600InstrInfo.h"
18 #include "SIFrameLowering.h"
19 #include "SIISelLowering.h"
20 #include "SIInstrInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/ADT/SmallString.h"
23 #include "llvm/CodeGen/MachineScheduler.h"
24 
25 using namespace llvm;
26 
27 #define DEBUG_TYPE "amdgpu-subtarget"
28 
29 #define GET_SUBTARGETINFO_ENUM
30 #define GET_SUBTARGETINFO_TARGET_DESC
31 #define GET_SUBTARGETINFO_CTOR
32 #include "AMDGPUGenSubtargetInfo.inc"
33 
34 AMDGPUSubtarget::~AMDGPUSubtarget() {}
35 
36 AMDGPUSubtarget &
37 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
38                                                  StringRef GPU, StringRef FS) {
39   // Determine default and user-specified characteristics
40   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
41   // enabled, but some instructions do not respect them and they run at the
42   // double precision rate, so don't enable by default.
43   //
44   // We want to be able to turn these off, but making this a subtarget feature
45   // for SI has the unhelpful behavior that it unsets everything else if you
46   // disable it.
47 
48   SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
49   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
50     FullFS += "+flat-for-global,+unaligned-buffer-access,";
51   FullFS += FS;
52 
53   ParseSubtargetFeatures(GPU, FullFS);
54 
55   // FIXME: I don't think think Evergreen has any useful support for
56   // denormals, but should be checked. Should we issue a warning somewhere
57   // if someone tries to enable these?
58   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
59     FP32Denormals = false;
60     FP64Denormals = false;
61   }
62 
63   // Set defaults if needed.
64   if (MaxPrivateElementSize == 0)
65     MaxPrivateElementSize = 4;
66 
67   return *this;
68 }
69 
70 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
71                                  const TargetMachine &TM)
72   : AMDGPUGenSubtargetInfo(TT, GPU, FS),
73     TargetTriple(TT),
74     Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
75     IsaVersion(ISAVersion0_0_0),
76     WavefrontSize(64),
77     LocalMemorySize(0),
78     LDSBankCount(0),
79     MaxPrivateElementSize(0),
80 
81     FastFMAF32(false),
82     HalfRate64Ops(false),
83 
84     FP32Denormals(false),
85     FP64Denormals(false),
86     FPExceptions(false),
87     FlatForGlobal(false),
88     UnalignedBufferAccess(false),
89 
90     EnableXNACK(false),
91     DebuggerInsertNops(false),
92     DebuggerReserveRegs(false),
93     DebuggerEmitPrologue(false),
94 
95     EnableVGPRSpilling(false),
96     EnablePromoteAlloca(false),
97     EnableLoadStoreOpt(false),
98     EnableUnsafeDSOffsetFolding(false),
99     EnableSIScheduler(false),
100     DumpCode(false),
101 
102     FP64(false),
103     IsGCN(false),
104     GCN1Encoding(false),
105     GCN3Encoding(false),
106     CIInsts(false),
107     SGPRInitBug(false),
108     HasSMemRealTime(false),
109     Has16BitInsts(false),
110     FlatAddressSpace(false),
111 
112     R600ALUInst(false),
113     CaymanISA(false),
114     CFALUBug(false),
115     HasVertexCache(false),
116     TexVTXClauseSize(0),
117 
118     FeatureDisable(false),
119     InstrItins(getInstrItineraryForCPU(GPU)),
120     TSInfo() {
121   initializeSubtargetDependencies(TT, GPU, FS);
122 }
123 
124 // FIXME: These limits are for SI. Did they change with the larger maximum LDS
125 // size?
126 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
127   switch (NWaves) {
128   case 10:
129     return 1638;
130   case 9:
131     return 1820;
132   case 8:
133     return 2048;
134   case 7:
135     return 2340;
136   case 6:
137     return 2730;
138   case 5:
139     return 3276;
140   case 4:
141     return 4096;
142   case 3:
143     return 5461;
144   case 2:
145     return 8192;
146   default:
147     return getLocalMemorySize();
148   }
149 }
150 
151 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
152   if (Bytes <= 1638)
153     return 10;
154 
155   if (Bytes <= 1820)
156     return 9;
157 
158   if (Bytes <= 2048)
159     return 8;
160 
161   if (Bytes <= 2340)
162     return 7;
163 
164   if (Bytes <= 2730)
165     return 6;
166 
167   if (Bytes <= 3276)
168     return 5;
169 
170   if (Bytes <= 4096)
171     return 4;
172 
173   if (Bytes <= 5461)
174     return 3;
175 
176   if (Bytes <= 8192)
177     return 2;
178 
179   return 1;
180 }
181 
182 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
183   const Function &F) const {
184 
185   // Default minimum/maximum flat work group sizes.
186   std::pair<unsigned, unsigned> Default =
187     AMDGPU::isCompute(F.getCallingConv()) ?
188       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
189                                     getWavefrontSize() * 4) :
190       std::pair<unsigned, unsigned>(1, getWavefrontSize());
191 
192   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
193   // starts using "amdgpu-flat-work-group-size" attribute.
194   Default.second = AMDGPU::getIntegerAttribute(
195     F, "amdgpu-max-work-group-size", Default.second);
196   Default.first = std::min(Default.first, Default.second);
197 
198   // Requested minimum/maximum flat work group sizes.
199   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
200     F, "amdgpu-flat-work-group-size", Default);
201 
202   // Make sure requested minimum is less than requested maximum.
203   if (Requested.first > Requested.second)
204     return Default;
205 
206   // Make sure requested values do not violate subtarget's specifications.
207   if (Requested.first < getMinFlatWorkGroupSize())
208     return Default;
209   if (Requested.second > getMaxFlatWorkGroupSize())
210     return Default;
211 
212   return Requested;
213 }
214 
215 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
216   const Function &F) const {
217 
218   // Default minimum/maximum number of waves per execution unit.
219   std::pair<unsigned, unsigned> Default(1, 0);
220 
221   // Default/requested minimum/maximum flat work group sizes.
222   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
223 
224   // If minimum/maximum flat work group sizes were explicitly requested using
225   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
226   // number of waves per execution unit to values implied by requested
227   // minimum/maximum flat work group sizes.
228   unsigned MinImpliedByFlatWorkGroupSize =
229     getMaxWavesPerEU(FlatWorkGroupSizes.second);
230   bool RequestedFlatWorkGroupSize = false;
231 
232   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
233   // starts using "amdgpu-flat-work-group-size" attribute.
234   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
235       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
236     Default.first = MinImpliedByFlatWorkGroupSize;
237     RequestedFlatWorkGroupSize = true;
238   }
239 
240   // Requested minimum/maximum number of waves per execution unit.
241   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
242     F, "amdgpu-waves-per-eu", Default, true);
243 
244   // Make sure requested minimum is less than requested maximum.
245   if (Requested.second && Requested.first > Requested.second)
246     return Default;
247 
248   // Make sure requested values do not violate subtarget's specifications.
249   if (Requested.first < getMinWavesPerEU() ||
250       Requested.first > getMaxWavesPerEU())
251     return Default;
252   if (Requested.second > getMaxWavesPerEU())
253     return Default;
254 
255   // Make sure requested values are compatible with values implied by requested
256   // minimum/maximum flat work group sizes.
257   if (RequestedFlatWorkGroupSize &&
258       Requested.first > MinImpliedByFlatWorkGroupSize)
259     return Default;
260 
261   return Requested;
262 }
263 
264 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
265                              const TargetMachine &TM) :
266   AMDGPUSubtarget(TT, GPU, FS, TM),
267   InstrInfo(*this),
268   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
269   TLInfo(TM, *this) {}
270 
271 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
272                          const TargetMachine &TM) :
273   AMDGPUSubtarget(TT, GPU, FS, TM),
274   InstrInfo(*this),
275   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
276   TLInfo(TM, *this),
277   GISel() {}
278 
279 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
280                                       unsigned NumRegionInstrs) const {
281   // Track register pressure so the scheduler can try to decrease
282   // pressure once register usage is above the threshold defined by
283   // SIRegisterInfo::getRegPressureSetLimit()
284   Policy.ShouldTrackPressure = true;
285 
286   // Enabling both top down and bottom up scheduling seems to give us less
287   // register spills than just using one of these approaches on its own.
288   Policy.OnlyTopDown = false;
289   Policy.OnlyBottomUp = false;
290 
291   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
292   if (!enableSIScheduler())
293     Policy.ShouldTrackLaneMasks = true;
294 }
295 
296 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
297   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
298 }
299 
300 unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
301   unsigned ImplicitBytes = getImplicitArgNumBytes();
302   if (ImplicitBytes == 0)
303     return ExplicitArgBytes;
304 
305   unsigned Alignment = getAlignmentForImplicitArgPtr();
306   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
307 }
308 
309 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
310   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
311     if (SGPRs <= 80)
312       return 10;
313     if (SGPRs <= 88)
314       return 9;
315     if (SGPRs <= 100)
316       return 8;
317     return 7;
318   }
319   if (SGPRs <= 48)
320     return 10;
321   if (SGPRs <= 56)
322     return 9;
323   if (SGPRs <= 64)
324     return 8;
325   if (SGPRs <= 72)
326     return 7;
327   if (SGPRs <= 80)
328     return 6;
329   return 5;
330 }
331 
332 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
333   if (VGPRs <= 24)
334     return 10;
335   if (VGPRs <= 28)
336     return 9;
337   if (VGPRs <= 32)
338     return 8;
339   if (VGPRs <= 36)
340     return 7;
341   if (VGPRs <= 40)
342     return 6;
343   if (VGPRs <= 48)
344     return 5;
345   if (VGPRs <= 64)
346     return 4;
347   if (VGPRs <= 84)
348     return 3;
349   if (VGPRs <= 128)
350     return 2;
351   return 1;
352 }
353