//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <vector>

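// Number of lanes in a full wavefront.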
#define MAX_LANES 64

using namespace llvm;

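/// Gather the function's resource usage flags up front: which implicit inputs
/// (kernarg pointer, work-group/work-item IDs, dispatch/queue pointers,
/// scratch setup) this function will need, based on its calling convention,
/// attributes, and subtarget.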
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
  : AMDGPUMachineFunction(MF),
    PrivateSegmentBuffer(false),
    DispatchPtr(false),
    QueuePtr(false),
    KernargSegmentPtr(false),
    DispatchID(false),
    FlatScratchInit(false),
    WorkGroupIDX(false),
    WorkGroupIDY(false),
    WorkGroupIDZ(false),
    WorkGroupInfo(false),
    PrivateSegmentWaveByteOffset(false),
    WorkItemIDX(false),
    WorkItemIDY(false),
    WorkItemIDZ(false),
    ImplicitBufferPtr(false),
    ImplicitArgPtr(false),
    GITPtrHigh(0xffffffff),
    HighBitsOf32BitAddress(0) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);

  Occupancy = getMaxWavesPerEU();
  limitOccupancy(MF);
  CallingConv::ID CC = F.getCallingConv();

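  // Kernel entry points always receive the X work-group and work-item IDs, and
  // the kernarg segment pointer whenever the kernel has arguments. Pixel
  // shaders instead start from their initial PS input address mask.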
  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
    if (!F.arg_empty())
      KernargSegmentPtr = true;
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  if (!isEntryFunction()) {
    // Non-entry functions have no special inputs for now, other than the
    // registers required for scratch access.
    ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
    ScratchWaveOffsetReg = AMDGPU::SGPR4;
    FrameOffsetReg = AMDGPU::SGPR5;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ArgInfo.PrivateSegmentBuffer =
      ArgDescriptor::createRegister(ScratchRSrcReg);
    ArgInfo.PrivateSegmentWaveByteOffset =
      ArgDescriptor::createRegister(ScratchWaveOffsetReg);

    if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
      KernargSegmentPtr = true;
      MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
                                 MaxKernArgAlign);
    }
  }

  if (ST.debuggerEmitPrologue()) {
    // Enable everything.
    WorkGroupIDX = true;
    WorkGroupIDY = true;
    WorkGroupIDZ = true;
    WorkItemIDX = true;
    WorkItemIDY = true;
    WorkItemIDZ = true;
  } else {
    if (F.hasFnAttribute("amdgpu-work-group-id-x"))
      WorkGroupIDX = true;

    if (F.hasFnAttribute("amdgpu-work-group-id-y"))
      WorkGroupIDY = true;

    if (F.hasFnAttribute("amdgpu-work-group-id-z"))
      WorkGroupIDZ = true;

    if (F.hasFnAttribute("amdgpu-work-item-id-x"))
      WorkItemIDX = true;

    if (F.hasFnAttribute("amdgpu-work-item-id-y"))
      WorkItemIDY = true;

    if (F.hasFnAttribute("amdgpu-work-item-id-z"))
      WorkItemIDZ = true;
  }

  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  bool HasStackObjects = FrameInfo.hasStackObjects();

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    PrivateSegmentWaveByteOffset = true;

    // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
        (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
      ArgInfo.PrivateSegmentWaveByteOffset =
          ArgDescriptor::createRegister(AMDGPU::SGPR5);
  }

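  // The HSA and Mesa ABIs supply a private segment buffer, and can preload the
  // dispatch pointer, queue pointer, and dispatch ID when the corresponding
  // attributes request them; other Mesa graphics shaders get a single implicit
  // buffer pointer instead.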
  bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (isAmdHsaOrMesa) {
    PrivateSegmentBuffer = true;

    if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
      DispatchPtr = true;

    if (F.hasFnAttribute("amdgpu-queue-ptr"))
      QueuePtr = true;

    if (F.hasFnAttribute("amdgpu-dispatch-id"))
      DispatchID = true;
  } else if (ST.isMesaGfxShader(F)) {
    ImplicitBufferPtr = true;
  }

  if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
    KernargSegmentPtr = true;

  if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
    // TODO: This could be refined a lot. The attribute is a poor way of
    // detecting calls that may require it before argument lowering.
    if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch"))
      FlatScratchInit = true;
  }

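  // Optional attribute overrides: the high half of the GIT pointer and the
  // high bits folded into 32-bit addresses.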
  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);
}

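/// Clamp the occupancy estimate to both the subtarget's wave limit and the
/// occupancy still achievable given this function's LDS usage.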
void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
                 MF.getFunction()));
}

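// Each of the add* helpers below reserves the next user SGPRs for one
// preloaded input, records the assignment in ArgInfo, and returns the
// allocated register: four SGPRs for the 128-bit private segment buffer
// descriptor, two SGPRs for each 64-bit pointer or ID.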
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
  const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentBuffer =
    ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass));
  NumUserSGPRs += 4;
  return ArgInfo.PrivateSegmentBuffer.getRegister();
}

unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr =
    ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

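/// \returns true if \p Reg appears in the null-terminated callee-saved
/// register list \p CSRegs.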
static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
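/// Each spilled SGPR occupies one 32-bit lane of a spill VGPR, so e.g. a
/// 16-byte (four SGPR) object reserves four lanes. Lanes are handed out
/// sequentially across calls and roll over into a freshly allocated VGPR
/// every wavefront-size lanes.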
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
                                                    int FI) {
  std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
  assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");

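  // One VGPR lane holds one 32-bit SGPR, so the spill needs Size / 4 lanes.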
  int NumLanes = Size / 4;

  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);

  // Make sure to handle the case where a wide SGPR spill may span two VGPRs.
  for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
    unsigned LaneVGPR;
    unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);

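    // A VGPRIndex of 0 means the previous spill VGPR (if any) is full, so find
    // an unused VGPR to hold the next WaveSize lanes.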
    if (VGPRIndex == 0) {
      LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
      if (LaneVGPR == AMDGPU::NoRegister) {
        // We have no VGPRs left for spilling SGPRs. Reset because we will not
        // partially spill the SGPR to VGPRs.
        SGPRToVGPRSpills.erase(FI);
        NumVGPRSpillLanes -= I;
        return false;
      }

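      // If the chosen VGPR is callee saved and this function must preserve it
      // (it makes calls or is not an entry function), reserve a spill slot so
      // the prolog/epilog can save and restore it.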
      Optional<int> CSRSpillFI;
      if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
          isCalleeSavedReg(CSRegs, LaneVGPR)) {
        CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4);
      }

      SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));

      // Add this register as live-in to all blocks to avoid the machine
      // verifier complaining about use of an undefined physical register.
      for (MachineBasicBlock &BB : MF)
        BB.addLiveIn(LaneVGPR);
    } else {
      LaneVGPR = SpillVGPRs.back().VGPR;
    }

    SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
  }

  return true;
}

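/// Drop the stack objects for all frame indices that were lowered to VGPR
/// lanes; their stack storage is no longer needed.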
void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
  for (auto &R : SGPRToVGPRSpills)
    MFI.RemoveStackObject(R.first);
}

/// \returns the VGPR used for \p Dim's work item ID.
unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const {
  switch (Dim) {
  case 0:
    assert(hasWorkItemIDX());
    return AMDGPU::VGPR0;
  case 1:
    assert(hasWorkItemIDY());
    return AMDGPU::VGPR1;
  case 2:
    assert(hasWorkItemIDZ());
    return AMDGPU::VGPR2;
  }
  llvm_unreachable("unexpected dimension");
}

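/// \returns the next unallocated user SGPR. User SGPRs start at SGPR0 and must
/// all be allocated before any system SGPRs.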
MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

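/// \returns the next unallocated system SGPR; system SGPRs follow immediately
/// after the user SGPRs.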
MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}