1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtargetInfo.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "R600ISelLowering.h"
17 #include "R600InstrInfo.h"
18 #include "SIFrameLowering.h"
19 #include "SIISelLowering.h"
20 #include "SIInstrInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/ADT/SmallString.h"
23 #include "llvm/CodeGen/MachineScheduler.h"
24 
25 using namespace llvm;
26 
27 #define DEBUG_TYPE "amdgpu-subtarget"
28 
29 #define GET_SUBTARGETINFO_ENUM
30 #define GET_SUBTARGETINFO_TARGET_DESC
31 #define GET_SUBTARGETINFO_CTOR
32 #include "AMDGPUGenSubtargetInfo.inc"
33 
34 AMDGPUSubtarget::~AMDGPUSubtarget() {}
35 
36 AMDGPUSubtarget &
37 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
38                                                  StringRef GPU, StringRef FS) {
39   // Determine default and user-specified characteristics
40   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
41   // enabled, but some instructions do not respect them and they run at the
42   // double precision rate, so don't enable by default.
43   //
44   // We want to be able to turn these off, but making this a subtarget feature
45   // for SI has the unhelpful behavior that it unsets everything else if you
46   // disable it.
47 
48   SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
  // HSA defaults to flat-for-global and unaligned buffer access.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-buffer-access,";
51   FullFS += FS;
52 
53   ParseSubtargetFeatures(GPU, FullFS);
54 
  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
58   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
59     FP32Denormals = false;
60     FP64Denormals = false;
61   }
62 
63   // Set defaults if needed.
64   if (MaxPrivateElementSize == 0)
65     MaxPrivateElementSize = 4;
66 
67   return *this;
68 }
69 
70 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
71                                  const TargetMachine &TM)
72   : AMDGPUGenSubtargetInfo(TT, GPU, FS),
73     TargetTriple(TT),
74     Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
75     IsaVersion(ISAVersion0_0_0),
76     WavefrontSize(64),
77     LocalMemorySize(0),
78     LDSBankCount(0),
79     MaxPrivateElementSize(0),
80 
81     FastFMAF32(false),
82     HalfRate64Ops(false),
83 
84     FP32Denormals(false),
85     FP64Denormals(false),
86     FPExceptions(false),
87     FlatForGlobal(false),
88     UnalignedBufferAccess(false),
89 
90     EnableXNACK(false),
91     DebuggerInsertNops(false),
92     DebuggerReserveRegs(false),
93     DebuggerEmitPrologue(false),
94 
95     EnableVGPRSpilling(false),
96     EnablePromoteAlloca(false),
97     EnableLoadStoreOpt(false),
98     EnableUnsafeDSOffsetFolding(false),
99     EnableSIScheduler(false),
100     DumpCode(false),
101 
102     FP64(false),
103     IsGCN(false),
104     GCN1Encoding(false),
105     GCN3Encoding(false),
106     CIInsts(false),
107     SGPRInitBug(false),
108     HasSMemRealTime(false),
109     Has16BitInsts(false),
110     HasMovrel(false),
111     HasVGPRIndexMode(false),
112     FlatAddressSpace(false),
113 
114     R600ALUInst(false),
115     CaymanISA(false),
116     CFALUBug(false),
117     HasVertexCache(false),
118     TexVTXClauseSize(0),
119 
120     FeatureDisable(false),
121     InstrItins(getInstrItineraryForCPU(GPU)),
122     TSInfo() {
123   initializeSubtargetDependencies(TT, GPU, FS);
124 }
125 
126 // FIXME: These limits are for SI. Did they change with the larger maximum LDS
127 // size?
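// The values below match floor(16384 / NWaves), i.e. they appear to split a
// 16 KB LDS budget evenly among the requested number of waves.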
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
129   switch (NWaves) {
130   case 10:
131     return 1638;
132   case 9:
133     return 1820;
134   case 8:
135     return 2048;
136   case 7:
137     return 2340;
138   case 6:
139     return 2730;
140   case 5:
141     return 3276;
142   case 4:
143     return 4096;
144   case 3:
145     return 5461;
146   case 2:
147     return 8192;
148   default:
149     return getLocalMemorySize();
150   }
151 }
152 
153 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
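  // Inverse of getMaxLocalMemSizeWithWaveCount: the thresholds below are the
  // same per-wave byte budgets, so this returns the highest wave count whose
  // budget still covers Bytes.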
154   if (Bytes <= 1638)
155     return 10;
156 
157   if (Bytes <= 1820)
158     return 9;
159 
160   if (Bytes <= 2048)
161     return 8;
162 
163   if (Bytes <= 2340)
164     return 7;
165 
166   if (Bytes <= 2730)
167     return 6;
168 
169   if (Bytes <= 3276)
170     return 5;
171 
172   if (Bytes <= 4096)
173     return 4;
174 
175   if (Bytes <= 5461)
176     return 3;
177 
178   if (Bytes <= 8192)
179     return 2;
180 
181   return 1;
182 }
183 
184 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
185   const Function &F) const {
186 
187   // Default minimum/maximum flat work group sizes.
188   std::pair<unsigned, unsigned> Default =
189     AMDGPU::isCompute(F.getCallingConv()) ?
190       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
191                                     getWavefrontSize() * 4) :
192       std::pair<unsigned, unsigned>(1, getWavefrontSize());
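  // For example, with a 64-wide wavefront this gives (128, 256) for compute
  // calling conventions and (1, 64) otherwise.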
193 
  // TODO: Do not process "amdgpu-max-work-group-size" attribute once Mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
196   Default.second = AMDGPU::getIntegerAttribute(
197     F, "amdgpu-max-work-group-size", Default.second);
198   Default.first = std::min(Default.first, Default.second);
199 
200   // Requested minimum/maximum flat work group sizes.
201   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
202     F, "amdgpu-flat-work-group-size", Default);
203 
  // Make sure requested minimum does not exceed requested maximum.
205   if (Requested.first > Requested.second)
206     return Default;
207 
208   // Make sure requested values do not violate subtarget's specifications.
209   if (Requested.first < getMinFlatWorkGroupSize())
210     return Default;
211   if (Requested.second > getMaxFlatWorkGroupSize())
212     return Default;
213 
214   return Requested;
215 }
216 
217 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
218   const Function &F) const {
219 
220   // Default minimum/maximum number of waves per execution unit.
221   std::pair<unsigned, unsigned> Default(1, 0);
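  // A maximum of 0 means that no explicit upper bound was requested; see the
  // check of Requested.second below.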
222 
223   // Default/requested minimum/maximum flat work group sizes.
224   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
225 
226   // If minimum/maximum flat work group sizes were explicitly requested using
227   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
228   // number of waves per execution unit to values implied by requested
229   // minimum/maximum flat work group sizes.
230   unsigned MinImpliedByFlatWorkGroupSize =
231     getMaxWavesPerEU(FlatWorkGroupSizes.second);
232   bool RequestedFlatWorkGroupSize = false;
233 
  // TODO: Do not process "amdgpu-max-work-group-size" attribute once Mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
236   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
237       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
238     Default.first = MinImpliedByFlatWorkGroupSize;
239     RequestedFlatWorkGroupSize = true;
240   }
241 
242   // Requested minimum/maximum number of waves per execution unit.
243   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
244     F, "amdgpu-waves-per-eu", Default, true);
245 
  // Make sure requested minimum does not exceed requested maximum.
247   if (Requested.second && Requested.first > Requested.second)
248     return Default;
249 
250   // Make sure requested values do not violate subtarget's specifications.
251   if (Requested.first < getMinWavesPerEU() ||
252       Requested.first > getMaxWavesPerEU())
253     return Default;
254   if (Requested.second > getMaxWavesPerEU())
255     return Default;
256 
257   // Make sure requested values are compatible with values implied by requested
258   // minimum/maximum flat work group sizes.
259   if (RequestedFlatWorkGroupSize &&
260       Requested.first > MinImpliedByFlatWorkGroupSize)
261     return Default;
262 
263   return Requested;
264 }
265 
266 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
267                              const TargetMachine &TM) :
268   AMDGPUSubtarget(TT, GPU, FS, TM),
269   InstrInfo(*this),
270   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
271   TLInfo(TM, *this) {}
272 
273 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
274                          const TargetMachine &TM) :
275   AMDGPUSubtarget(TT, GPU, FS, TM),
276   InstrInfo(*this),
277   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
278   TLInfo(TM, *this),
279   GISel() {}
280 
281 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
282                                       unsigned NumRegionInstrs) const {
283   // Track register pressure so the scheduler can try to decrease
284   // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
286   Policy.ShouldTrackPressure = true;
287 
  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using either approach on its own.
290   Policy.OnlyTopDown = false;
291   Policy.OnlyBottomUp = false;
292 
293   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
294   if (!enableSIScheduler())
295     Policy.ShouldTrackLaneMasks = true;
296 }
297 
bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
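  // Spilling VGPRs is always allowed for non-shader (compute) calling
  // conventions; shaders must opt in via EnableVGPRSpilling.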
299   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
300 }
301 
302 unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
303   unsigned ImplicitBytes = getImplicitArgNumBytes();
304   if (ImplicitBytes == 0)
305     return ExplicitArgBytes;
306 
307   unsigned Alignment = getAlignmentForImplicitArgPtr();
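  // Hypothetical example: ExplicitArgBytes = 36 and Alignment = 8 round up to
  // 40, so with ImplicitBytes = 48 the kernarg segment is 88 bytes.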
308   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
309 }
310 
311 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
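  // Map the SGPR count of a wave to the maximum achievable occupancy (waves
  // per execution unit). VI and newer tolerate more SGPRs at each occupancy
  // level, so their thresholds differ.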
312   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
313     if (SGPRs <= 80)
314       return 10;
315     if (SGPRs <= 88)
316       return 9;
317     if (SGPRs <= 100)
318       return 8;
319     return 7;
320   }
321   if (SGPRs <= 48)
322     return 10;
323   if (SGPRs <= 56)
324     return 9;
325   if (SGPRs <= 64)
326     return 8;
327   if (SGPRs <= 72)
328     return 7;
329   if (SGPRs <= 80)
330     return 6;
331   return 5;
332 }
333 
334 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
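  // The thresholds are consistent with a 256-entry VGPR file per SIMD and a
  // 4-register allocation granularity: floor(256 / waves), rounded down to a
  // multiple of 4.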
335   if (VGPRs <= 24)
336     return 10;
337   if (VGPRs <= 28)
338     return 9;
339   if (VGPRs <= 32)
340     return 8;
341   if (VGPRs <= 36)
342     return 7;
343   if (VGPRs <= 40)
344     return 6;
345   if (VGPRs <= 48)
346     return 5;
347   if (VGPRs <= 64)
348     return 4;
349   if (VGPRs <= 84)
350     return 3;
351   if (VGPRs <= 128)
352     return 2;
353   return 1;
354 }
355