1 //===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "SIMachineFunctionInfo.h"
11 #include "AMDGPUArgumentUsageInfo.h"
12 #include "AMDGPUSubtarget.h"
13 #include "SIRegisterInfo.h"
14 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15 #include "Utils/AMDGPUBaseInfo.h"
16 #include "llvm/ADT/Optional.h"
17 #include "llvm/CodeGen/MachineBasicBlock.h"
18 #include "llvm/CodeGen/MachineFrameInfo.h"
19 #include "llvm/CodeGen/MachineFunction.h"
20 #include "llvm/CodeGen/MachineRegisterInfo.h"
21 #include "llvm/IR/CallingConv.h"
22 #include "llvm/IR/Function.h"
23 #include <cassert>
24 #include <vector>
25 
26 #define MAX_LANES 64
27 
28 using namespace llvm;
29 
30 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
31   : AMDGPUMachineFunction(MF),
32     PrivateSegmentBuffer(false),
33     DispatchPtr(false),
34     QueuePtr(false),
35     KernargSegmentPtr(false),
36     DispatchID(false),
37     FlatScratchInit(false),
38     GridWorkgroupCountX(false),
39     GridWorkgroupCountY(false),
40     GridWorkgroupCountZ(false),
41     WorkGroupIDX(false),
42     WorkGroupIDY(false),
43     WorkGroupIDZ(false),
44     WorkGroupInfo(false),
45     PrivateSegmentWaveByteOffset(false),
46     WorkItemIDX(false),
47     WorkItemIDY(false),
48     WorkItemIDZ(false),
49     ImplicitBufferPtr(false),
50     ImplicitArgPtr(false),
51     GITPtrHigh(0xffffffff),
52     HighBitsOf32BitAddress(0) {
53   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
54   const Function &F = MF.getFunction();
55   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
56   WavesPerEU = ST.getWavesPerEU(F);
57 
58   Occupancy = getMaxWavesPerEU();
59   limitOccupancy(MF);
60 
61   if (!isEntryFunction()) {
62     // Non-entry functions have no special inputs for now, other registers
63     // required for scratch access.
64     ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
65     ScratchWaveOffsetReg = AMDGPU::SGPR4;
66     FrameOffsetReg = AMDGPU::SGPR5;
67     StackPtrOffsetReg = AMDGPU::SGPR32;
68 
69     ArgInfo.PrivateSegmentBuffer =
70       ArgDescriptor::createRegister(ScratchRSrcReg);
71     ArgInfo.PrivateSegmentWaveByteOffset =
72       ArgDescriptor::createRegister(ScratchWaveOffsetReg);
73 
74     if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
75       ImplicitArgPtr = true;
76   } else {
77     if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
78       KernargSegmentPtr = true;
79       assert(MaxKernArgAlign == 0);
80       MaxKernArgAlign =  ST.getAlignmentForImplicitArgPtr();
81     }
82   }
83 
84   CallingConv::ID CC = F.getCallingConv();
85   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
86     if (!F.arg_empty())
87       KernargSegmentPtr = true;
88     WorkGroupIDX = true;
89     WorkItemIDX = true;
90   } else if (CC == CallingConv::AMDGPU_PS) {
91     PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
92   }
93 
94   if (ST.debuggerEmitPrologue()) {
95     // Enable everything.
96     WorkGroupIDX = true;
97     WorkGroupIDY = true;
98     WorkGroupIDZ = true;
99     WorkItemIDX = true;
100     WorkItemIDY = true;
101     WorkItemIDZ = true;
102   } else {
103     if (F.hasFnAttribute("amdgpu-work-group-id-x"))
104       WorkGroupIDX = true;
105 
106     if (F.hasFnAttribute("amdgpu-work-group-id-y"))
107       WorkGroupIDY = true;
108 
109     if (F.hasFnAttribute("amdgpu-work-group-id-z"))
110       WorkGroupIDZ = true;
111 
112     if (F.hasFnAttribute("amdgpu-work-item-id-x"))
113       WorkItemIDX = true;
114 
115     if (F.hasFnAttribute("amdgpu-work-item-id-y"))
116       WorkItemIDY = true;
117 
118     if (F.hasFnAttribute("amdgpu-work-item-id-z"))
119       WorkItemIDZ = true;
120   }
121 
122   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
123   bool MaySpill = ST.isVGPRSpillingEnabled(F);
124   bool HasStackObjects = FrameInfo.hasStackObjects();
125 
126   if (isEntryFunction()) {
127     // X, XY, and XYZ are the only supported combinations, so make sure Y is
128     // enabled if Z is.
129     if (WorkItemIDZ)
130       WorkItemIDY = true;
131 
132     if (HasStackObjects || MaySpill) {
133       PrivateSegmentWaveByteOffset = true;
134 
135     // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
136     if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
137         (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
138       ArgInfo.PrivateSegmentWaveByteOffset
139         = ArgDescriptor::createRegister(AMDGPU::SGPR5);
140     }
141   }
142 
143   bool IsCOV2 = ST.isAmdCodeObjectV2(F);
144   if (IsCOV2) {
145     if (HasStackObjects || MaySpill)
146       PrivateSegmentBuffer = true;
147 
148     if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
149       DispatchPtr = true;
150 
151     if (F.hasFnAttribute("amdgpu-queue-ptr"))
152       QueuePtr = true;
153 
154     if (F.hasFnAttribute("amdgpu-dispatch-id"))
155       DispatchID = true;
156   } else if (ST.isMesaGfxShader(F)) {
157     if (HasStackObjects || MaySpill)
158       ImplicitBufferPtr = true;
159   }
160 
161   if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
162     KernargSegmentPtr = true;
163 
164   if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) {
165     // TODO: This could be refined a lot. The attribute is a poor way of
166     // detecting calls that may require it before argument lowering.
167     if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch"))
168       FlatScratchInit = true;
169   }
170 
171   Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
172   StringRef S = A.getValueAsString();
173   if (!S.empty())
174     S.consumeInteger(0, GITPtrHigh);
175 
176   A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
177   S = A.getValueAsString();
178   if (!S.empty())
179     S.consumeInteger(0, HighBitsOf32BitAddress);
180 }
181 
182 void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
183   limitOccupancy(getMaxWavesPerEU());
184   const SISubtarget& ST = MF.getSubtarget<SISubtarget>();
185   limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
186                  MF.getFunction()));
187 }
188 
189 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
190   const SIRegisterInfo &TRI) {
191   ArgInfo.PrivateSegmentBuffer =
192     ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
193     getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass));
194   NumUserSGPRs += 4;
195   return ArgInfo.PrivateSegmentBuffer.getRegister();
196 }
197 
198 unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
199   ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
200     getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
201   NumUserSGPRs += 2;
202   return ArgInfo.DispatchPtr.getRegister();
203 }
204 
205 unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
206   ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
207     getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
208   NumUserSGPRs += 2;
209   return ArgInfo.QueuePtr.getRegister();
210 }
211 
212 unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
213   ArgInfo.KernargSegmentPtr
214     = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
215     getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
216   NumUserSGPRs += 2;
217   return ArgInfo.KernargSegmentPtr.getRegister();
218 }
219 
220 unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
221   ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
222     getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
223   NumUserSGPRs += 2;
224   return ArgInfo.DispatchID.getRegister();
225 }
226 
227 unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
228   ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
229     getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
230   NumUserSGPRs += 2;
231   return ArgInfo.FlatScratchInit.getRegister();
232 }
233 
234 unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
235   ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
236     getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
237   NumUserSGPRs += 2;
238   return ArgInfo.ImplicitBufferPtr.getRegister();
239 }
240 
241 static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
242   for (unsigned I = 0; CSRegs[I]; ++I) {
243     if (CSRegs[I] == Reg)
244       return true;
245   }
246 
247   return false;
248 }
249 
250 /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
251 bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
252                                                     int FI) {
253   std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];
254 
255   // This has already been allocated.
256   if (!SpillLanes.empty())
257     return true;
258 
259   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
260   const SIRegisterInfo *TRI = ST.getRegisterInfo();
261   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
262   MachineRegisterInfo &MRI = MF.getRegInfo();
263   unsigned WaveSize = ST.getWavefrontSize();
264 
265   unsigned Size = FrameInfo.getObjectSize(FI);
266   assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
267   assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
268 
269   int NumLanes = Size / 4;
270 
271   const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
272 
273   // Make sure to handle the case where a wide SGPR spill may span between two
274   // VGPRs.
275   for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
276     unsigned LaneVGPR;
277     unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
278 
279     if (VGPRIndex == 0) {
280       LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
281       if (LaneVGPR == AMDGPU::NoRegister) {
282         // We have no VGPRs left for spilling SGPRs. Reset because we will not
283         // partially spill the SGPR to VGPRs.
284         SGPRToVGPRSpills.erase(FI);
285         NumVGPRSpillLanes -= I;
286         return false;
287       }
288 
289       Optional<int> CSRSpillFI;
290       if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
291           isCalleeSavedReg(CSRegs, LaneVGPR)) {
292         CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4);
293       }
294 
295       SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
296 
297       // Add this register as live-in to all blocks to avoid machine verifer
298       // complaining about use of an undefined physical register.
299       for (MachineBasicBlock &BB : MF)
300         BB.addLiveIn(LaneVGPR);
301     } else {
302       LaneVGPR = SpillVGPRs.back().VGPR;
303     }
304 
305     SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
306   }
307 
308   return true;
309 }
310 
311 void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
312   for (auto &R : SGPRToVGPRSpills)
313     MFI.RemoveStackObject(R.first);
314 }
315 
316 
317 /// \returns VGPR used for \p Dim' work item ID.
318 unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const {
319   switch (Dim) {
320   case 0:
321     assert(hasWorkItemIDX());
322     return AMDGPU::VGPR0;
323   case 1:
324     assert(hasWorkItemIDY());
325     return AMDGPU::VGPR1;
326   case 2:
327     assert(hasWorkItemIDZ());
328     return AMDGPU::VGPR2;
329   }
330   llvm_unreachable("unexpected dimension");
331 }
332 
333 MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
334   assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
335   return AMDGPU::SGPR0 + NumUserSGPRs;
336 }
337 
338 MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
339   return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
340 }
341