//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <vector>

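// Number of lanes in a full wavefront (wave64).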
#define MAX_LANES 64

using namespace llvm;

SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
  : AMDGPUMachineFunction(MF),
    Mode(MF.getFunction()),
    PrivateSegmentBuffer(false),
    DispatchPtr(false),
    QueuePtr(false),
    KernargSegmentPtr(false),
    DispatchID(false),
    FlatScratchInit(false),
    WorkGroupIDX(false),
    WorkGroupIDY(false),
    WorkGroupIDZ(false),
    WorkGroupInfo(false),
    PrivateSegmentWaveByteOffset(false),
    WorkItemIDX(false),
    WorkItemIDY(false),
    WorkItemIDZ(false),
    ImplicitBufferPtr(false),
    ImplicitArgPtr(false),
    GITPtrHigh(0xffffffff),
    HighBitsOf32BitAddress(0) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);

  Occupancy = getMaxWavesPerEU();
  limitOccupancy(MF);
  CallingConv::ID CC = F.getCallingConv();

  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
    if (!F.arg_empty())
      KernargSegmentPtr = true;
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  if (!isEntryFunction()) {
    // Non-entry functions have no special inputs for now; they only need the
    // registers required for scratch access.
    ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
    ScratchWaveOffsetReg = AMDGPU::SGPR4;
    FrameOffsetReg = AMDGPU::SGPR5;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ArgInfo.PrivateSegmentBuffer =
      ArgDescriptor::createRegister(ScratchRSrcReg);
    ArgInfo.PrivateSegmentWaveByteOffset =
      ArgDescriptor::createRegister(ScratchWaveOffsetReg);

    if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
      KernargSegmentPtr = true;
      MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
                                 MaxKernArgAlign);
    }
  }

  if (F.hasFnAttribute("amdgpu-work-group-id-x"))
    WorkGroupIDX = true;

  if (F.hasFnAttribute("amdgpu-work-group-id-y"))
    WorkGroupIDY = true;

  if (F.hasFnAttribute("amdgpu-work-group-id-z"))
    WorkGroupIDZ = true;

  if (F.hasFnAttribute("amdgpu-work-item-id-x"))
    WorkItemIDX = true;

  if (F.hasFnAttribute("amdgpu-work-item-id-y"))
    WorkItemIDY = true;

  if (F.hasFnAttribute("amdgpu-work-item-id-z"))
    WorkItemIDZ = true;

  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  bool HasStackObjects = FrameInfo.hasStackObjects();

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    PrivateSegmentWaveByteOffset = true;

    // HS and GS always have the scratch wave offset in SGPR5 on GFX9 and later.
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
        (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
      ArgInfo.PrivateSegmentWaveByteOffset =
          ArgDescriptor::createRegister(AMDGPU::SGPR5);
  }

  bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (isAmdHsaOrMesa) {
    PrivateSegmentBuffer = true;

    if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
      DispatchPtr = true;

    if (F.hasFnAttribute("amdgpu-queue-ptr"))
      QueuePtr = true;

    if (F.hasFnAttribute("amdgpu-dispatch-id"))
      DispatchID = true;
  } else if (ST.isMesaGfxShader(F)) {
    ImplicitBufferPtr = true;
  }

  if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
    KernargSegmentPtr = true;

  if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
    // TODO: This could be refined a lot. The attribute is a poor way of
    // detecting calls that may require it before argument lowering.
    if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch"))
      FlatScratchInit = true;
  }

  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);
}

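// Tighten the occupancy estimate: occupancy can never exceed the wave limit,
// and LDS usage may constrain it further.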
void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
                 MF.getFunction()));
}

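// Each add* helper below reserves the next unused user SGPRs for one
// preloaded input, records the assignment in ArgInfo, and returns the chosen
// register.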
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
  const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentBuffer =
    ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass));
  NumUserSGPRs += 4;
  return ArgInfo.PrivateSegmentBuffer.getRegister();
}

unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr
    = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

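/// \returns true if \p Reg appears in the null-terminated list \p CSRegs.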
static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
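/// \returns true if spill lanes were assigned (or had already been assigned);
/// false if no free VGPR remained to hold the spilled values.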
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
                                                    int FI) {
  std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
  assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");

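  // Each spilled SGPR takes one 4-byte lane of a VGPR, and a VGPR provides
  // WaveSize lanes, so a wide spill may need lanes from more than one VGPR.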
  int NumLanes = Size / 4;

  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);

  // A wide SGPR spill may span more than one VGPR, so allocate its lanes one
  // at a time.
  for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
    unsigned LaneVGPR;
    unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);

    if (VGPRIndex == 0) {
      LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
      if (LaneVGPR == AMDGPU::NoRegister) {
        // We have no VGPRs left for spilling SGPRs. Reset because we will not
        // partially spill the SGPR to VGPRs.
        SGPRToVGPRSpills.erase(FI);
        NumVGPRSpillLanes -= I;
        return false;
      }

      Optional<int> CSRSpillFI;
      if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
          isCalleeSavedReg(CSRegs, LaneVGPR)) {
        CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4);
      }

      SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));

      // Add this register as live-in to all blocks to avoid machine verifier
      // complaining about use of an undefined physical register.
      for (MachineBasicBlock &BB : MF)
        BB.addLiveIn(LaneVGPR);
    } else {
      LaneVGPR = SpillVGPRs.back().VGPR;
    }

    SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
  }

  return true;
}

void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
  for (auto &R : SGPRToVGPRSpills)
    MFI.RemoveStackObject(R.first);
}

/// \returns VGPR used for \p Dim's work item ID.
unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const {
  switch (Dim) {
  case 0:
    assert(hasWorkItemIDX());
    return AMDGPU::VGPR0;
  case 1:
    assert(hasWorkItemIDY());
    return AMDGPU::VGPR1;
  case 2:
    assert(hasWorkItemIDZ());
    return AMDGPU::VGPR2;
  }
  llvm_unreachable("unexpected dimension");
}

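/// \returns the next unallocated user SGPR. Only valid before any system
/// SGPRs have been allocated, as the assert below enforces.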
MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

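/// \returns the next unallocated system SGPR; system SGPRs are placed
/// immediately after the user SGPRs.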
MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}

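/// Print \p Reg using the target's register names (e.g. "$sgpr32") into a
/// YAML string value for MIR serialization.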
static yaml::StringValue regToString(unsigned Reg,
                                     const TargetRegisterInfo &TRI) {
  yaml::StringValue Dest;
  {
    raw_string_ostream OS(Dest.Value);
    OS << printReg(Reg, &TRI);
  }
  return Dest;
}

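// Capture the serializable state of the in-memory function info for MIR
// output.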
yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
  const llvm::SIMachineFunctionInfo &MFI,
  const TargetRegisterInfo &TRI)
  : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
    MaxKernArgAlign(MFI.getMaxKernArgAlign()),
    LDSSize(MFI.getLDSSize()),
    IsEntryFunction(MFI.isEntryFunction()),
    NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
    MemoryBound(MFI.isMemoryBound()),
    WaveLimiter(MFI.needsWaveLimiter()),
    ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
    ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)),
    FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
    StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)) {}

void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
  MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
}

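/// Restore the base fields from parsed MIR YAML. \returns false, meaning no
/// error occurred (this step currently cannot fail).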
bool SIMachineFunctionInfo::initializeBaseYamlFields(
  const yaml::SIMachineFunctionInfo &YamlMFI) {
  ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
  MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
  LDSSize = YamlMFI.LDSSize;
  IsEntryFunction = YamlMFI.IsEntryFunction;
  NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
  MemoryBound = YamlMFI.MemoryBound;
  WaveLimiter = YamlMFI.WaveLimiter;
  return false;
}