1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "R600ISelLowering.h"
17 #include "R600InstrInfo.h"
18 #include "SIFrameLowering.h"
19 #include "SIISelLowering.h"
20 #include "SIInstrInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/ADT/SmallString.h"
23 #include "llvm/CodeGen/MachineScheduler.h"
24 
25 using namespace llvm;
26 
27 #define DEBUG_TYPE "amdgpu-subtarget"
28 
29 #define GET_SUBTARGETINFO_ENUM
30 #define GET_SUBTARGETINFO_TARGET_DESC
31 #define GET_SUBTARGETINFO_CTOR
32 #include "AMDGPUGenSubtargetInfo.inc"
33 
34 AMDGPUSubtarget::~AMDGPUSubtarget() {}
35 
36 AMDGPUSubtarget &
37 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
38                                                  StringRef GPU, StringRef FS) {
39   // Determine default and user-specified characteristics
40   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
41   // enabled, but some instructions do not respect them and they run at the
42   // double precision rate, so don't enable by default.
43   //
44   // We want to be able to turn these off, but making this a subtarget feature
45   // for SI has the unhelpful behavior that it unsets everything else if you
46   // disable it.
47 
48   SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
49   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
50     FullFS += "+flat-for-global,+unaligned-buffer-access,";
51   FullFS += FS;
52 
53   ParseSubtargetFeatures(GPU, FullFS);
54 
55   // FIXME: I don't think think Evergreen has any useful support for
56   // denormals, but should be checked. Should we issue a warning somewhere
57   // if someone tries to enable these?
58   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
59     FP16Denormals = false;
60     FP32Denormals = false;
61     FP64Denormals = false;
62   }
63 
64   // Set defaults if needed.
65   if (MaxPrivateElementSize == 0)
66     MaxPrivateElementSize = 4;
67 
68   return *this;
69 }
70 
/// Constructs the common AMDGPU subtarget state.
///
/// Every feature flag and size field is first given a fixed starting value in
/// the initializer list; the constructor body then calls
/// initializeSubtargetDependencies(), which parses the feature string and
/// overwrites the fields it covers.
///
/// \param TT  Target triple; stored in TargetTriple and used to pick the
///            initial generation.
/// \param GPU Processor name; also used to look up the instruction
///            itineraries.
/// \param FS  Feature string forwarded to the generated base class and to
///            initializeSubtargetDependencies().
/// \param TM  Target machine (not referenced in this constructor).
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples start at SOUTHERN_ISLANDS, everything else at R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    // 0 is replaced with 4 in initializeSubtargetDependencies() if it is
    // still 0 after feature parsing.
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP16Denormals(false),
    FP32Denormals(false),
    FP64Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)),
    TSInfo() {
  // Parse the feature string and apply the post-parse fixups.
  initializeSubtargetDependencies(TT, GPU, FS);
}
130 
131 // FIXME: These limits are for SI. Did they change with the larger maximum LDS
132 // size?
133 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
134   switch (NWaves) {
135   case 10:
136     return 1638;
137   case 9:
138     return 1820;
139   case 8:
140     return 2048;
141   case 7:
142     return 2340;
143   case 6:
144     return 2730;
145   case 5:
146     return 3276;
147   case 4:
148     return 4096;
149   case 3:
150     return 5461;
151   case 2:
152     return 8192;
153   default:
154     return getLocalMemorySize();
155   }
156 }
157 
158 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
159   if (Bytes <= 1638)
160     return 10;
161 
162   if (Bytes <= 1820)
163     return 9;
164 
165   if (Bytes <= 2048)
166     return 8;
167 
168   if (Bytes <= 2340)
169     return 7;
170 
171   if (Bytes <= 2730)
172     return 6;
173 
174   if (Bytes <= 3276)
175     return 5;
176 
177   if (Bytes <= 4096)
178     return 4;
179 
180   if (Bytes <= 5461)
181     return 3;
182 
183   if (Bytes <= 8192)
184     return 2;
185 
186   return 1;
187 }
188 
189 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
190   const Function &F) const {
191 
192   // Default minimum/maximum flat work group sizes.
193   std::pair<unsigned, unsigned> Default =
194     AMDGPU::isCompute(F.getCallingConv()) ?
195       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
196                                     getWavefrontSize() * 4) :
197       std::pair<unsigned, unsigned>(1, getWavefrontSize());
198 
199   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
200   // starts using "amdgpu-flat-work-group-size" attribute.
201   Default.second = AMDGPU::getIntegerAttribute(
202     F, "amdgpu-max-work-group-size", Default.second);
203   Default.first = std::min(Default.first, Default.second);
204 
205   // Requested minimum/maximum flat work group sizes.
206   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
207     F, "amdgpu-flat-work-group-size", Default);
208 
209   // Make sure requested minimum is less than requested maximum.
210   if (Requested.first > Requested.second)
211     return Default;
212 
213   // Make sure requested values do not violate subtarget's specifications.
214   if (Requested.first < getMinFlatWorkGroupSize())
215     return Default;
216   if (Requested.second > getMaxFlatWorkGroupSize())
217     return Default;
218 
219   return Requested;
220 }
221 
222 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
223   const Function &F) const {
224 
225   // Default minimum/maximum number of waves per execution unit.
226   std::pair<unsigned, unsigned> Default(1, 0);
227 
228   // Default/requested minimum/maximum flat work group sizes.
229   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
230 
231   // If minimum/maximum flat work group sizes were explicitly requested using
232   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
233   // number of waves per execution unit to values implied by requested
234   // minimum/maximum flat work group sizes.
235   unsigned MinImpliedByFlatWorkGroupSize =
236     getMaxWavesPerEU(FlatWorkGroupSizes.second);
237   bool RequestedFlatWorkGroupSize = false;
238 
239   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
240   // starts using "amdgpu-flat-work-group-size" attribute.
241   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
242       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
243     Default.first = MinImpliedByFlatWorkGroupSize;
244     RequestedFlatWorkGroupSize = true;
245   }
246 
247   // Requested minimum/maximum number of waves per execution unit.
248   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
249     F, "amdgpu-waves-per-eu", Default, true);
250 
251   // Make sure requested minimum is less than requested maximum.
252   if (Requested.second && Requested.first > Requested.second)
253     return Default;
254 
255   // Make sure requested values do not violate subtarget's specifications.
256   if (Requested.first < getMinWavesPerEU() ||
257       Requested.first > getMaxWavesPerEU())
258     return Default;
259   if (Requested.second > getMaxWavesPerEU())
260     return Default;
261 
262   // Make sure requested values are compatible with values implied by requested
263   // minimum/maximum flat work group sizes.
264   if (RequestedFlatWorkGroupSize &&
265       Requested.first > MinImpliedByFlatWorkGroupSize)
266     return Default;
267 
268   return Requested;
269 }
270 
/// Constructs the R600 subtarget: initializes the shared AMDGPUSubtarget base
/// and then the R600-specific instruction info, frame lowering, and target
/// lowering members.
///
/// \param TT  Target triple, forwarded to the base class.
/// \param GPU Processor name, forwarded to the base class.
/// \param FS  Feature string, forwarded to the base class.
/// \param TM  Target machine, forwarded to the base class and to TLInfo.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  // Frame lowering is set up with an upward-growing stack and the
  // subtarget's stack alignment.
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
277 
/// Constructs the SI subtarget: initializes the shared AMDGPUSubtarget base
/// and then the SI-specific instruction info, frame lowering, and target
/// lowering members; GISel is value-initialized.
///
/// \param TT  Target triple, forwarded to the base class.
/// \param GPU Processor name, forwarded to the base class.
/// \param FS  Feature string, forwarded to the base class.
/// \param TM  Target machine, forwarded to the base class and to TLInfo.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  // Frame lowering is set up with an upward-growing stack and the
  // subtarget's stack alignment.
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this),
  GISel() {}
285 
286 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
287                                       unsigned NumRegionInstrs) const {
288   // Track register pressure so the scheduler can try to decrease
289   // pressure once register usage is above the threshold defined by
290   // SIRegisterInfo::getRegPressureSetLimit()
291   Policy.ShouldTrackPressure = true;
292 
293   // Enabling both top down and bottom up scheduling seems to give us less
294   // register spills than just using one of these approaches on its own.
295   Policy.OnlyTopDown = false;
296   Policy.OnlyBottomUp = false;
297 
298   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
299   if (!enableSIScheduler())
300     Policy.ShouldTrackLaneMasks = true;
301 }
302 
303 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
304   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
305 }
306 
307 unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
308   unsigned ImplicitBytes = getImplicitArgNumBytes();
309   if (ImplicitBytes == 0)
310     return ExplicitArgBytes;
311 
312   unsigned Alignment = getAlignmentForImplicitArgPtr();
313   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
314 }
315 
316 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
317   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
318     if (SGPRs <= 80)
319       return 10;
320     if (SGPRs <= 88)
321       return 9;
322     if (SGPRs <= 100)
323       return 8;
324     return 7;
325   }
326   if (SGPRs <= 48)
327     return 10;
328   if (SGPRs <= 56)
329     return 9;
330   if (SGPRs <= 64)
331     return 8;
332   if (SGPRs <= 72)
333     return 7;
334   if (SGPRs <= 80)
335     return 6;
336   return 5;
337 }
338 
339 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
340   if (VGPRs <= 24)
341     return 10;
342   if (VGPRs <= 28)
343     return 9;
344   if (VGPRs <= 32)
345     return 8;
346   if (VGPRs <= 36)
347     return 7;
348   if (VGPRs <= 40)
349     return 6;
350   if (VGPRs <= 48)
351     return 5;
352   if (VGPRs <= 64)
353     return 4;
354   if (VGPRs <= 84)
355     return 3;
356   if (VGPRs <= 128)
357     return 2;
358   return 1;
359 }
360 
361 unsigned SISubtarget::getMaxNumSGPRs() const {
362   if (hasSGPRInitBug())
363     return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
364 
365   if (getGeneration() >= VOLCANIC_ISLANDS)
366     return 102;
367 
368   return 104;
369 }
370