//===----------------------- SIFrameLowering.cpp -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "AMDGPUSubtarget.h"

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

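// True when the only stack traffic in the function is SGPR spill code. SGPR
// spills are lowered to VGPR lane writes rather than real scratch memory, so
// emitPrologue can skip the scratch setup entirely for such functions.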
static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
                              const MachineFrameInfo &MFI) {
  return FuncInfo->hasSpilledSGPRs() &&
    !FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects();
}

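// Enumerate the SGPRs addressable on this subtarget, either individually or
// as aligned 4-dword tuples. Illustration (hypothetical limit, and assuming
// the class enumerates aligned tuples in ascending order): with 96 SGPRs,
// getAllSGPRs covers s0..s95 and getAllSGPR128 the 24 tuples
// s[0:3], s[4:7], ..., s[92:95].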
static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
                                         const SIRegisterInfo *TRI) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      TRI->getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF,
                                       const SIRegisterInfo *TRI) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
                      TRI->getMaxNumSGPRs(MF));
}

void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
                                          const SIRegisterInfo *TRI,
                                          MachineFunction &MF,
                                          MachineBasicBlock &MBB) const {
  // We don't need this if we only have spills, since there is no user-facing
  // scratch.

  // TODO: If we know earlier that we don't have flat instructions, we can
  // omit this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect whether flat instructions are used at
  // all, this will be set up more often than necessary on VI.

  // The debug location must be unknown since the first debug location is used
  // to determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  unsigned FlatScratchInitReg
    = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  // Copy the size in bytes.
  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // Add the wave offset in bytes to the private base offset.
  // See the comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert the offset to 256-byte units.
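  // Illustration (hypothetical values): if FlatScrInitLo now held 0x00120400
  // (base 0x00120000 plus wave offset 0x400), the shift below would write
  // 0x00001204 to FLAT_SCR_HI.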
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
  const SISubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {

  // We need to insert initialization of the scratch resource descriptor.
  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  assert(ScratchRsrcReg != AMDGPU::NoRegister);

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers; the only
  // ones we cannot eliminate are the resources required for scratch access.
  // For now we skip over user SGPRs and may leave unused holes.

  // We find the resource first because it has an alignment requirement.

  MachineRegisterInfo &MRI = MF.getRegInfo();

  // Round up so that a partially used 4-register block is not treated as
  // free.
  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  // Skip the last 2 elements because the last one is reserved for VCC, and
  // this is the 2nd to last element already.
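  // Illustration (hypothetical count): with 17 preloaded user SGPRs,
  // NumPreloaded is (17 + 3) / 4 = 5, so the scan starts at the tuple
  // s[20:23] and stops two tuples short of the end of the list.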
  for (MCPhysReg Reg : getAllSGPR128(MF, TRI).drop_back(2).slice(NumPreloaded)) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed.
    if (!MRI.isPhysRegUsed(Reg)) {
      assert(MRI.isAllocatable(Reg));
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
  const SISubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ST.hasSGPRInitBug() ||
      ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
    return ScratchWaveOffsetReg;

  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

  // We need to drop registers from the end of the list that we cannot use
  // for the scratch wave offset:
  // + 2 because s102 and s103 do not exist on VI
  // + 2 for vcc
  // + 2 for xnack_mask
  // + 2 for flat_scratch
  // + 4 for the registers reserved for the scratch resource register
  // + 1 for the register reserved for the scratch wave offset. (By excluding
  //     this register from the list to consider, it means that when this
  //     register is being used for the scratch wave offset and there are no
  //     other free SGPRs, then the value will stay in this register.)
  // ----
  //  13
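  // Illustration (hypothetical limit): if getMaxNumSGPRs returned 104,
  // drop_back(13) would leave s0..s90 as candidates before slicing off the
  // preloaded SGPRs.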
  for (MCPhysReg Reg : getAllSGPRs(MF, TRI).drop_back(13).slice(NumPreloaded)) {
    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
    // scratch descriptor, since we haven't added its uses yet.
    if (!MRI.isPhysRegUsed(Reg)) {
      if (!MRI.isAllocatable(Reg) ||
          TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
        continue;

      MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
      MFI->setScratchWaveOffsetReg(Reg);
      return Reg;
    }
  }

  return ScratchWaveOffsetReg;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  // Emit the debugger prologue if the "amdgpu-debugger-emit-prologue"
  // attribute was specified.
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  if (ST.debuggerEmitPrologue())
    emitDebuggerPrologue(MF, MBB);

  if (!MF.getFrameInfo().hasStackObjects())
    return;

  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If we only have SGPR spills, we won't actually be using scratch memory
  // since these spill to VGPRs.
  //
  // FIXME: We should be cleaning up these unused SGPR spill frame indices
  // somewhere.
  if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
    return;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned ScratchRsrcReg
    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
  unsigned ScratchWaveOffsetReg
    = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
  assert(ScratchRsrcReg != AMDGPU::NoRegister);
  assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
  assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));

  if (MFI->hasFlatScratchInit())
    emitFlatScratchInit(TII, TRI, MF, MBB);

  // We need to insert initialization of the scratch resource descriptor.
  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
  if (ST.isAmdCodeObjectV2()) {
    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
  }

  // If we reserved the original input registers, we don't need to copy to the
  // reserved registers.
  if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
    // We should always reserve these 5 registers at the same time.
    assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
           "scratch wave offset and private segment buffer inconsistent");
    return;
  }

  // We added live-ins during argument lowering, but since they were not used
  // they were deleted. We're adding the uses now, so add them back.
  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);

  if (ST.isAmdCodeObjectV2()) {
    MRI.addLiveIn(PreloadedPrivateBufferReg);
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

  // Make the selected registers live throughout the function.
  for (MachineBasicBlock &OtherBB : MF) {
    if (&OtherBB == &MBB)
      continue;

    OtherBB.addLiveIn(ScratchRsrcReg);
    OtherBB.addLiveIn(ScratchWaveOffsetReg);
  }

  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
    // Make sure we emit the copy for the offset first. We may have chosen to
    // copy the buffer resource into a register that aliases the input offset
    // register.
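    // Illustration (hypothetical assignment): if the offset were preloaded
    // in s9 while s[8:11] had been chosen for the resource descriptor,
    // copying the descriptor first would clobber the offset.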
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
      .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
  }

  if (ST.isAmdCodeObjectV2()) {
    // Insert copies from the argument register.
    assert(
      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));

    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  } else {
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and set up the other bits manually.
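    // SCRATCH_RSRC_DWORD0/1 are resolved by the loader to the 64-bit base
    // address of the scratch buffer; getScratchRsrcWords23 supplies the
    // remaining descriptor fields (assumed here to be num_records and the
    // format/flag bits) for words 2 and 3.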
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();
    BuildMI(MBB, I, DL, SMovB32, Rsrc0)
      .addExternalSymbol("SCRATCH_RSRC_DWORD0")
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc1)
      .addExternalSymbol("SCRATCH_RSRC_DWORD1")
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  }
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (!MFI.hasStackObjects())
    return;

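  // Frame index elimination may need a free SGPR to materialize offsets, so
  // reserve an emergency spill slot for the register scavenger. The slot is
  // sized and aligned like a single 32-bit SGPR.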
  assert(RS && "RegScavenger required if spilling");

  int ScavengeFI = MFI.CreateStackObject(
    AMDGPU::SGPR_32RegClass.getSize(),
    AMDGPU::SGPR_32RegClass.getAlignment(), false);
  RS->addScavengingFrameIndex(ScavengeFI);
}

void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MachineBasicBlock::iterator I = MBB.begin();
  DebugLoc DL;

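  // Each ID register below is re-added as a live-in and stored to a stack
  // object reserved for the debugger (see the getDebugger*StackObjectIndex
  // accessors on SIMachineFunctionInfo).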
  // For each dimension:
  for (unsigned i = 0; i < 3; ++i) {
    // Get the work group ID SGPR, and make it live-in again.
    unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
    MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
    MBB.addLiveIn(WorkGroupIDSGPR);

    // Since SGPRs are spilled into VGPRs, copy the work group ID SGPR to a
    // VGPR in order to spill it to scratch.
    unsigned WorkGroupIDVGPR =
      MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
      .addReg(WorkGroupIDSGPR);

    // Spill the work group ID.
    int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
      WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);

    // Get the work item ID VGPR, and make it live-in again.
    unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
    MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
    MBB.addLiveIn(WorkItemIDVGPR);

    // Spill the work item ID.
    int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
      WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
  }
}