//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() {}

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
    FP64Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)),
    TSInfo() {
  initializeSubtargetDependencies(TT, GPU, FS);
}

// FIXME: These limits are for SI. Did they change with the larger maximum LDS
// size?
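// The limits below appear to correspond to floor(16384 / NWaves) bytes of LDS
// per work group.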
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
  switch (NWaves) {
  case 10:
    return 1638;
  case 9:
    return 1820;
  case 8:
    return 2048;
  case 7:
    return 2340;
  case 6:
    return 2730;
  case 5:
    return 3276;
  case 4:
    return 4096;
  case 3:
    return 5461;
  case 2:
    return 8192;
  default:
    return getLocalMemorySize();
  }
}

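// Inverse of the mapping above: returns the maximum occupancy (wave count)
// that fits within the given LDS usage per work group.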
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
  if (Bytes <= 1638)
    return 10;

  if (Bytes <= 1820)
    return 9;

  if (Bytes <= 2048)
    return 8;

  if (Bytes <= 2340)
    return 7;

  if (Bytes <= 2730)
    return 6;

  if (Bytes <= 3276)
    return 5;

  if (Bytes <= 4096)
    return 4;

  if (Bytes <= 5461)
    return 3;

  if (Bytes <= 8192)
    return 2;

  return 1;
}

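// Compute the {minimum, maximum} flat work group size for the function,
// falling back to the defaults when the requested values are inconsistent or
// exceed the subtarget's limits.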
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {

  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is not larger than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

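// Compute the minimum/maximum number of waves per execution unit for the
// function, based on the "amdgpu-waves-per-eu" attribute and the limits
// implied by the requested flat work group sizes.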
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {

  // Default minimum/maximum number of waves per execution unit (a maximum of
  // 0 means no explicit maximum was requested).
  std::pair<unsigned, unsigned> Default(1, 0);

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is not larger than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this),
  GISel() {}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

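// VGPR spilling is always allowed for compute functions; graphics shaders
// only get it when VGPR spilling has been explicitly enabled.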
bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

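// Implicit kernel arguments, if any, are appended after the explicit
// arguments, with the explicit portion first padded to the implicit argument
// pointer alignment.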
unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes();
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

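// Maximum occupancy (number of waves) achievable with the given number of
// SGPRs per wave. VI and later generations use different SGPR allocation
// limits, hence the separate thresholds.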
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

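// Maximum occupancy (number of waves) achievable with the given number of
// VGPRs per wave. The thresholds appear to correspond to 256 VGPRs per SIMD
// divided by the wave count, rounded down to a granularity of 4.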
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}