//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <vector>

#define MAX_LANES 64

using namespace llvm;

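// Determine up front which implicit arguments, preloaded registers, and
// special-purpose registers this function needs, based on its calling
// convention, function attributes, and subtarget.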
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
  : AMDGPUMachineFunction(MF),
    PrivateSegmentBuffer(false),
    DispatchPtr(false),
    QueuePtr(false),
    KernargSegmentPtr(false),
    DispatchID(false),
    FlatScratchInit(false),
    WorkGroupIDX(false),
    WorkGroupIDY(false),
    WorkGroupIDZ(false),
    WorkGroupInfo(false),
    PrivateSegmentWaveByteOffset(false),
    WorkItemIDX(false),
    WorkItemIDY(false),
    WorkItemIDZ(false),
    ImplicitBufferPtr(false),
    ImplicitArgPtr(false),
    GITPtrHigh(0xffffffff),
    HighBitsOf32BitAddress(0) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);

  Occupancy = getMaxWavesPerEU();
  limitOccupancy(MF);
  CallingConv::ID CC = F.getCallingConv();

  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
    if (!F.arg_empty())
      KernargSegmentPtr = true;
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  if (!isEntryFunction()) {
    // Non-entry functions have no special inputs for now, other than the
    // registers required for scratch access.
    ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
    ScratchWaveOffsetReg = AMDGPU::SGPR4;
    FrameOffsetReg = AMDGPU::SGPR5;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ArgInfo.PrivateSegmentBuffer =
      ArgDescriptor::createRegister(ScratchRSrcReg);
    ArgInfo.PrivateSegmentWaveByteOffset =
      ArgDescriptor::createRegister(ScratchWaveOffsetReg);

    if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
      KernargSegmentPtr = true;
      MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
                                 MaxKernArgAlign);
    }
  }

  if (F.hasFnAttribute("amdgpu-work-group-id-x"))
    WorkGroupIDX = true;

  if (F.hasFnAttribute("amdgpu-work-group-id-y"))
    WorkGroupIDY = true;

  if (F.hasFnAttribute("amdgpu-work-group-id-z"))
    WorkGroupIDZ = true;

  if (F.hasFnAttribute("amdgpu-work-item-id-x"))
    WorkItemIDX = true;

  if (F.hasFnAttribute("amdgpu-work-item-id-y"))
    WorkItemIDY = true;

  if (F.hasFnAttribute("amdgpu-work-item-id-z"))
    WorkItemIDZ = true;

  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  bool HasStackObjects = FrameInfo.hasStackObjects();

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    PrivateSegmentWaveByteOffset = true;

    // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
        (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
      ArgInfo.PrivateSegmentWaveByteOffset =
          ArgDescriptor::createRegister(AMDGPU::SGPR5);
  }

  bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (isAmdHsaOrMesa) {
    PrivateSegmentBuffer = true;

    if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
      DispatchPtr = true;

    if (F.hasFnAttribute("amdgpu-queue-ptr"))
      QueuePtr = true;

    if (F.hasFnAttribute("amdgpu-dispatch-id"))
      DispatchID = true;
  } else if (ST.isMesaGfxShader(F)) {
    ImplicitBufferPtr = true;
  }

  if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
    KernargSegmentPtr = true;

  if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
    // TODO: This could be refined a lot. The attribute is a poor way of
    // detecting calls that may require it before argument lowering.
    if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch"))
      FlatScratchInit = true;
  }

  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);
}

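// Clamp the occupancy estimate to both the subtarget's maximum waves per EU
// and the occupancy achievable given this function's LDS usage.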
void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
                 MF.getFunction()));
}

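// Allocate four consecutive user SGPRs (one SReg_128) to hold the private
// segment buffer resource descriptor.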
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
  const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentBuffer =
    ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass));
  NumUserSGPRs += 4;
  return ArgInfo.PrivateSegmentBuffer.getRegister();
}

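// Each of the remaining preloaded arguments is a 64-bit value, allocated in
// the next two user SGPRs as a single SReg_64.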
unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr
    = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

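// Return true if Reg appears in the null-terminated list CSRegs.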
static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
                                                    int FI) {
  std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
  assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");

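  // Each VGPR lane holds a single 32-bit SGPR, so a spill of Size bytes
  // occupies Size / 4 lanes.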
  int NumLanes = Size / 4;

  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);

  // Make sure to handle the case where a wide SGPR spill may span between two
  // VGPRs.
  for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
    unsigned LaneVGPR;
    unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);

    if (VGPRIndex == 0) {
      LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
      if (LaneVGPR == AMDGPU::NoRegister) {
        // We have no VGPRs left for spilling SGPRs. Reset because we will not
        // partially spill the SGPR to VGPRs.
        SGPRToVGPRSpills.erase(FI);
        NumVGPRSpillLanes -= I;
        return false;
      }

      Optional<int> CSRSpillFI;
      if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
          isCalleeSavedReg(CSRegs, LaneVGPR)) {
        CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4);
      }

      SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));

      // Add this register as live-in to all blocks to avoid the machine
      // verifier complaining about use of an undefined physical register.
      for (MachineBasicBlock &BB : MF)
        BB.addLiveIn(LaneVGPR);
    } else {
      LaneVGPR = SpillVGPRs.back().VGPR;
    }

    SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
  }

  return true;
}

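// The SGPR spills to these frame indices have all been lowered to VGPR lanes,
// so the stack objects backing them are no longer needed.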
void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
  for (auto &R : SGPRToVGPRSpills)
    MFI.RemoveStackObject(R.first);
}

/// \returns VGPR used for \p Dim's work item ID.
unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const {
  switch (Dim) {
  case 0:
    assert(hasWorkItemIDX());
    return AMDGPU::VGPR0;
  case 1:
    assert(hasWorkItemIDY());
    return AMDGPU::VGPR1;
  case 2:
    assert(hasWorkItemIDZ());
    return AMDGPU::VGPR2;
  }
  llvm_unreachable("unexpected dimension");
}

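// User SGPRs are allocated in order starting at SGPR0, and must all be
// claimed before any system SGPRs are added.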
MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

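// System SGPRs are allocated immediately after the user SGPRs.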
MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}

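// Print a register using the target's register names, for serialization to
// MIR YAML.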
static yaml::StringValue regToString(unsigned Reg,
                                     const TargetRegisterInfo &TRI) {
  yaml::StringValue Dest;
  {
    raw_string_ostream OS(Dest.Value);
    OS << printReg(Reg, &TRI);
  }
  return Dest;
}

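// Capture a serializable snapshot of the machine function info for MIR
// printing.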
yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
  const llvm::SIMachineFunctionInfo &MFI,
  const TargetRegisterInfo &TRI)
  : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
    MaxKernArgAlign(MFI.getMaxKernArgAlign()),
    LDSSize(MFI.getLDSSize()),
    IsEntryFunction(MFI.isEntryFunction()),
    NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
    MemoryBound(MFI.isMemoryBound()),
    WaveLimiter(MFI.needsWaveLimiter()),
    ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
    ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)),
    FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
    StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)) {}

void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
  MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
}

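// Restore the fields serialized by yaml::SIMachineFunctionInfo from parsed
// MIR. Returns true on error; none of these fields can fail to apply, so this
// always returns false.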
bool SIMachineFunctionInfo::initializeBaseYamlFields(
  const yaml::SIMachineFunctionInfo &YamlMFI) {
  ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
  MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
  LDSSize = YamlMFI.LDSSize;
  IsEntryFunction = YamlMFI.IsEntryFunction;
  NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
  MemoryBound = YamlMFI.MemoryBound;
  WaveLimiter = YamlMFI.WaveLimiter;
  return false;
}