1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "llvm/ADT/SmallString.h"
17 #include "llvm/CodeGen/MachineScheduler.h"
18 #include "llvm/Target/TargetFrameLowering.h"
19 #include <algorithm>
20 
21 using namespace llvm;
22 
23 #define DEBUG_TYPE "amdgpu-subtarget"
24 
25 #define GET_SUBTARGETINFO_ENUM
26 #define GET_SUBTARGETINFO_TARGET_DESC
27 #define GET_SUBTARGETINFO_CTOR
28 #include "AMDGPUGenSubtargetInfo.inc"
29 
// Out-of-line defaulted destructor; presumably anchors this polymorphic
// class's vtable in this TU — confirm against the header's declaration.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
31 
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults are prepended so that anything the user passes in FS (appended
  // below) overrides them during feature parsing.
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  // Note: checked against the user-provided FS, not FullFS, so the HSA default
  // added above does not count as an explicit request.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed (feature parsing may have left it at 0).
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
73 
// Constructor: zero-initializes every feature flag, then lets
// initializeSubtargetDependencies() parse the feature string and apply
// per-generation defaults. Initializer order must match the declaration
// order of the members in AMDGPUSubtarget.h.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples start at SI; anything else is the legacy R600 family.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Performance-related feature flags.
    FastFMAF32(false),
    HalfRate64Ops(false),

    // FP denormal/exception and memory-access behavior.
    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    // XNACK and debugger support.
    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    // Optimization toggles.
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    // ISA capability flags (set from the feature string).
    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    // R600-family specifics.
    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}
134 
135 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
136   const Function &F) const {
137   if (NWaves == 1)
138     return getLocalMemorySize();
139   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
140   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
141   unsigned MaxWaves = getMaxWavesPerEU();
142   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
143 }
144 
145 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
146   const Function &F) const {
147   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
148   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
149   unsigned MaxWaves = getMaxWavesPerEU();
150   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
151   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
152   NumWaves = std::min(NumWaves, MaxWaves);
153   NumWaves = std::max(NumWaves, 1u);
154   return NumWaves;
155 }
156 
157 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
158   const Function &F) const {
159   // Default minimum/maximum flat work group sizes.
160   std::pair<unsigned, unsigned> Default =
161     AMDGPU::isCompute(F.getCallingConv()) ?
162       std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
163                                     getWavefrontSize() * 4) :
164       std::pair<unsigned, unsigned>(1, getWavefrontSize());
165 
166   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
167   // starts using "amdgpu-flat-work-group-size" attribute.
168   Default.second = AMDGPU::getIntegerAttribute(
169     F, "amdgpu-max-work-group-size", Default.second);
170   Default.first = std::min(Default.first, Default.second);
171 
172   // Requested minimum/maximum flat work group sizes.
173   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
174     F, "amdgpu-flat-work-group-size", Default);
175 
176   // Make sure requested minimum is less than requested maximum.
177   if (Requested.first > Requested.second)
178     return Default;
179 
180   // Make sure requested values do not violate subtarget's specifications.
181   if (Requested.first < getMinFlatWorkGroupSize())
182     return Default;
183   if (Requested.second > getMaxFlatWorkGroupSize())
184     return Default;
185 
186   return Requested;
187 }
188 
/// Compute the {minimum, maximum} number of waves per execution unit for
/// \p F, reconciling the "amdgpu-waves-per-eu" attribute with the limits
/// implied by the flat work group size attributes. Returns the defaults
/// whenever a requested value is inconsistent or out of range.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  // A maximum of 0 means "no explicit upper bound".
  std::pair<unsigned, unsigned> Default(1, 0);

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (A requested maximum of 0 again means "unbounded" and is not checked.)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
236 
// R600-family subtarget: wires up the R600 instruction info, frame lowering
// (stack grows up on this target), and target lowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
243 
// GCN (SI+) subtarget: wires up the SI instruction info, frame lowering
// (stack grows up on this target), and target lowering.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
250 
251 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
252                                       unsigned NumRegionInstrs) const {
253   // Track register pressure so the scheduler can try to decrease
254   // pressure once register usage is above the threshold defined by
255   // SIRegisterInfo::getRegPressureSetLimit()
256   Policy.ShouldTrackPressure = true;
257 
258   // Enabling both top down and bottom up scheduling seems to give us less
259   // register spills than just using one of these approaches on its own.
260   Policy.OnlyTopDown = false;
261   Policy.OnlyBottomUp = false;
262 
263   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
264   if (!enableSIScheduler())
265     Policy.ShouldTrackLaneMasks = true;
266 }
267 
268 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
269   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
270 }
271 
272 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
273 					    unsigned ExplicitArgBytes) const {
274   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
275   if (ImplicitBytes == 0)
276     return ExplicitArgBytes;
277 
278   unsigned Alignment = getAlignmentForImplicitArgPtr();
279   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
280 }
281 
282 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
283   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
284     if (SGPRs <= 80)
285       return 10;
286     if (SGPRs <= 88)
287       return 9;
288     if (SGPRs <= 100)
289       return 8;
290     return 7;
291   }
292   if (SGPRs <= 48)
293     return 10;
294   if (SGPRs <= 56)
295     return 9;
296   if (SGPRs <= 64)
297     return 8;
298   if (SGPRs <= 72)
299     return 7;
300   if (SGPRs <= 80)
301     return 6;
302   return 5;
303 }
304 
305 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
306   if (VGPRs <= 24)
307     return 10;
308   if (VGPRs <= 28)
309     return 9;
310   if (VGPRs <= 32)
311     return 8;
312   if (VGPRs <= 36)
313     return 7;
314   if (VGPRs <= 40)
315     return 6;
316   if (VGPRs <= 48)
317     return 5;
318   if (VGPRs <= 64)
319     return 4;
320   if (VGPRs <= 84)
321     return 3;
322   if (VGPRs <= 128)
323     return 2;
324   return 1;
325 }
326 
327 unsigned SISubtarget::getMaxNumSGPRs() const {
328   if (hasSGPRInitBug())
329     return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
330 
331   if (getGeneration() >= VOLCANIC_ISLANDS)
332     return 102;
333 
334   return 104;
335 }
336