1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtargetInfo.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "llvm/ADT/SmallString.h"
17 #include "llvm/CodeGen/MachineScheduler.h"
18 #include "llvm/Target/TargetFrameLowering.h"
19 #include <algorithm>
20 
21 using namespace llvm;
22 
23 #define DEBUG_TYPE "amdgpu-subtarget"
24 
25 #define GET_SUBTARGETINFO_ENUM
26 #define GET_SUBTARGETINFO_TARGET_DESC
27 #define GET_SUBTARGETINFO_CTOR
28 #include "AMDGPUGenSubtargetInfo.inc"
29 
30 AMDGPUSubtarget::~AMDGPUSubtarget() = default;
31 
32 AMDGPUSubtarget &
33 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
34                                                  StringRef GPU, StringRef FS) {
35   // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them, and FP32 operations
  // run at the double-precision rate when FP32 denormals are enabled, so don't
  // enable them by default.
39   //
40   // We want to be able to turn these off, but making this a subtarget feature
41   // for SI has the unhelpful behavior that it unsets everything else if you
42   // disable it.
43 
44   SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
  // Turn on FlatForGlobal and unaligned buffer access for HSA.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-buffer-access,";
47 
48   FullFS += FS;
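  // For illustration (assuming an HSA target and an empty user feature
  // string), FullFS at this point is:
  //   "+promote-alloca,+fp64-fp16-denormals,+load-store-opt,"
  //   "+flat-for-global,+unaligned-buffer-access,"
  // User features in FS are appended last so they can override these defaults.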
49 
50   ParseSubtargetFeatures(GPU, FullFS);
51 
  // Unless flat-for-global is explicitly enabled or disabled in the feature
  // string, turn on FlatForGlobal for all OSes on VI and newer hardware to
  // avoid assertion failures due to missing ADDR64 variants of MUBUF
  // instructions.
55   if (!hasAddr64() && !FS.contains("flat-for-global")) {
56     FlatForGlobal = true;
57   }
58 
  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
62   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
63     FP64FP16Denormals = false;
64     FP32Denormals = false;
65   }
66 
67   // Set defaults if needed.
68   if (MaxPrivateElementSize == 0)
69     MaxPrivateElementSize = 4;
70 
71   return *this;
72 }
73 
74 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
75                                  const TargetMachine &TM)
76   : AMDGPUGenSubtargetInfo(TT, GPU, FS),
77     TargetTriple(TT),
78     Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
79     IsaVersion(ISAVersion0_0_0),
80     WavefrontSize(64),
81     LocalMemorySize(0),
82     LDSBankCount(0),
83     MaxPrivateElementSize(0),
84 
85     FastFMAF32(false),
86     HalfRate64Ops(false),
87 
88     FP32Denormals(false),
89     FP64FP16Denormals(false),
90     FPExceptions(false),
91     FlatForGlobal(false),
92     UnalignedScratchAccess(false),
93     UnalignedBufferAccess(false),
94 
95     EnableXNACK(false),
96     DebuggerInsertNops(false),
97     DebuggerReserveRegs(false),
98     DebuggerEmitPrologue(false),
99 
100     EnableVGPRSpilling(false),
101     EnablePromoteAlloca(false),
102     EnableLoadStoreOpt(false),
103     EnableUnsafeDSOffsetFolding(false),
104     EnableSIScheduler(false),
105     DumpCode(false),
106 
107     FP64(false),
108     IsGCN(false),
109     GCN1Encoding(false),
110     GCN3Encoding(false),
111     CIInsts(false),
112     SGPRInitBug(false),
113     HasSMemRealTime(false),
114     Has16BitInsts(false),
115     HasMovrel(false),
116     HasVGPRIndexMode(false),
117     HasScalarStores(false),
118     HasInv2PiInlineImm(false),
119     HasSDWA(false),
120     HasDPP(false),
121     FlatAddressSpace(false),
122 
123     R600ALUInst(false),
124     CaymanISA(false),
125     CFALUBug(false),
126     HasVertexCache(false),
127     TexVTXClauseSize(0),
128     ScalarizeGlobal(false),
129 
130     FeatureDisable(false),
131     InstrItins(getInstrItineraryForCPU(GPU)) {
132   initializeSubtargetDependencies(TT, GPU, FS);
133 }
134 
135 // FIXME: These limits are for SI. Did they change with the larger maximum LDS
136 // size?
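// Note: the table below appears to correspond to 65536 bytes of LDS divided
// by the total number of waves (NWaves per SIMD times 4 SIMDs), rounded down;
// e.g. 65536 / (10 * 4) = 1638. This derivation is an inference from the
// values, not something stated in this file.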
137 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
138   switch (NWaves) {
139   case 10:
140     return 1638;
141   case 9:
142     return 1820;
143   case 8:
144     return 2048;
145   case 7:
146     return 2340;
147   case 6:
148     return 2730;
149   case 5:
150     return 3276;
151   case 4:
152     return 4096;
153   case 3:
154     return 5461;
155   case 2:
156     return 8192;
157   default:
158     return getLocalMemorySize();
159   }
160 }
161 
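// Inverse of the table above: given an LDS usage in bytes, return the largest
// wave count whose limit still accommodates it.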
162 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
163   if (Bytes <= 1638)
164     return 10;
165 
166   if (Bytes <= 1820)
167     return 9;
168 
169   if (Bytes <= 2048)
170     return 8;
171 
172   if (Bytes <= 2340)
173     return 7;
174 
175   if (Bytes <= 2730)
176     return 6;
177 
178   if (Bytes <= 3276)
179     return 5;
180 
181   if (Bytes <= 4096)
182     return 4;
183 
184   if (Bytes <= 5461)
185     return 3;
186 
187   if (Bytes <= 8192)
188     return 2;
189 
190   return 1;
191 }
192 
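// For illustration, a kernel carrying the IR attribute
//   "amdgpu-flat-work-group-size"="128,256"
// requests a minimum flat work group size of 128 and a maximum of 256; the
// pair is parsed by AMDGPU::getIntegerPairAttribute below. Requests that are
// inconsistent or outside the subtarget's limits fall back to the defaults.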
193 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
194   const Function &F) const {
195   // Default minimum/maximum flat work group sizes.
196   std::pair<unsigned, unsigned> Default =
197     AMDGPU::isCompute(F.getCallingConv()) ?
198       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
199                                     getWavefrontSize() * 4) :
200       std::pair<unsigned, unsigned>(1, getWavefrontSize());
201 
202   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
203   // starts using "amdgpu-flat-work-group-size" attribute.
204   Default.second = AMDGPU::getIntegerAttribute(
205     F, "amdgpu-max-work-group-size", Default.second);
206   Default.first = std::min(Default.first, Default.second);
207 
208   // Requested minimum/maximum flat work group sizes.
209   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
210     F, "amdgpu-flat-work-group-size", Default);
211 
  // Make sure requested minimum does not exceed requested maximum.
213   if (Requested.first > Requested.second)
214     return Default;
215 
216   // Make sure requested values do not violate subtarget's specifications.
217   if (Requested.first < getMinFlatWorkGroupSize())
218     return Default;
219   if (Requested.second > getMaxFlatWorkGroupSize())
220     return Default;
221 
222   return Requested;
223 }
224 
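// For illustration, the IR attribute
//   "amdgpu-waves-per-eu"="2,4"
// requests between 2 and 4 waves per execution unit. A single value such as
// "2" requests a minimum only; the boolean flag passed to
// AMDGPU::getIntegerPairAttribute below appears to make the second value
// optional.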
225 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
226   const Function &F) const {
227   // Default minimum/maximum number of waves per execution unit.
228   std::pair<unsigned, unsigned> Default(1, 0);
229 
230   // Default/requested minimum/maximum flat work group sizes.
231   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
232 
233   // If minimum/maximum flat work group sizes were explicitly requested using
234   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
235   // number of waves per execution unit to values implied by requested
236   // minimum/maximum flat work group sizes.
237   unsigned MinImpliedByFlatWorkGroupSize =
238     getMaxWavesPerEU(FlatWorkGroupSizes.second);
239   bool RequestedFlatWorkGroupSize = false;
240 
241   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
242   // starts using "amdgpu-flat-work-group-size" attribute.
243   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
244       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
245     Default.first = MinImpliedByFlatWorkGroupSize;
246     RequestedFlatWorkGroupSize = true;
247   }
248 
249   // Requested minimum/maximum number of waves per execution unit.
250   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
251     F, "amdgpu-waves-per-eu", Default, true);
252 
  // Make sure requested minimum does not exceed requested maximum.
254   if (Requested.second && Requested.first > Requested.second)
255     return Default;
256 
257   // Make sure requested values do not violate subtarget's specifications.
258   if (Requested.first < getMinWavesPerEU() ||
259       Requested.first > getMaxWavesPerEU())
260     return Default;
261   if (Requested.second > getMaxWavesPerEU())
262     return Default;
263 
264   // Make sure requested values are compatible with values implied by requested
265   // minimum/maximum flat work group sizes.
266   if (RequestedFlatWorkGroupSize &&
267       Requested.first > MinImpliedByFlatWorkGroupSize)
268     return Default;
269 
270   return Requested;
271 }
272 
273 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
274                              const TargetMachine &TM) :
275   AMDGPUSubtarget(TT, GPU, FS, TM),
276   InstrInfo(*this),
277   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
278   TLInfo(TM, *this) {}
279 
280 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
281                          const TargetMachine &TM) :
282   AMDGPUSubtarget(TT, GPU, FS, TM),
283   InstrInfo(*this),
284   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
285   TLInfo(TM, *this) {}
286 
287 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
288                                       unsigned NumRegionInstrs) const {
289   // Track register pressure so the scheduler can try to decrease
290   // pressure once register usage is above the threshold defined by
291   // SIRegisterInfo::getRegPressureSetLimit()
292   Policy.ShouldTrackPressure = true;
293 
294   // Enabling both top down and bottom up scheduling seems to give us less
295   // register spills than just using one of these approaches on its own.
296   Policy.OnlyTopDown = false;
297   Policy.OnlyBottomUp = false;
298 
299   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
300   if (!enableSIScheduler())
301     Policy.ShouldTrackLaneMasks = true;
302 }
303 
bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
305   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
306 }
307 
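// For illustration (with assumed numbers): for 36 bytes of explicit arguments,
// an implicit-argument pointer alignment of 8, and 48 implicit bytes, this
// returns alignTo(36, 8) + 48 = 40 + 48 = 88 bytes.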
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
310   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
311   if (ImplicitBytes == 0)
312     return ExplicitArgBytes;
313 
314   unsigned Alignment = getAlignmentForImplicitArgPtr();
315   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
316 }
317 
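// The SGPR thresholds below appear to come from dividing the per-SIMD SGPR
// file (larger on VI and newer than on SI/CI) among the waves at the
// hardware's SGPR allocation granularity; hence the separate table for
// VOLCANIC_ISLANDS and newer.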
318 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
319   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
320     if (SGPRs <= 80)
321       return 10;
322     if (SGPRs <= 88)
323       return 9;
324     if (SGPRs <= 100)
325       return 8;
326     return 7;
327   }
328   if (SGPRs <= 48)
329     return 10;
330   if (SGPRs <= 56)
331     return 9;
332   if (SGPRs <= 64)
333     return 8;
334   if (SGPRs <= 72)
335     return 7;
336   if (SGPRs <= 80)
337     return 6;
338   return 5;
339 }
340 
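// The VGPR thresholds appear to follow 256 VGPRs per SIMD divided by the wave
// count and rounded down to the allocation granularity of 4, e.g.
// 256 / 3 = 85 -> 84 for 3 waves. This is an inference from the values, not
// something stated in this file.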
341 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
342   if (VGPRs <= 24)
343     return 10;
344   if (VGPRs <= 28)
345     return 9;
346   if (VGPRs <= 32)
347     return 8;
348   if (VGPRs <= 36)
349     return 7;
350   if (VGPRs <= 40)
351     return 6;
352   if (VGPRs <= 48)
353     return 5;
354   if (VGPRs <= 64)
355     return 4;
356   if (VGPRs <= 84)
357     return 3;
358   if (VGPRs <= 128)
359     return 2;
360   return 1;
361 }
362 
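// SI/CI expose 104 addressable SGPRs; VI and newer return 102 here, which
// appears to account for the SGPRs reserved for FLAT_SCRATCH on those
// targets.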
363 unsigned SISubtarget::getMaxNumSGPRs() const {
364   if (hasSGPRInitBug())
365     return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
366 
367   if (getGeneration() >= VOLCANIC_ISLANDS)
368     return 102;
369 
370   return 104;
371 }
372