1 //===----------------------- SIFrameLowering.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 
9 #include "SIFrameLowering.h"
10 #include "AMDGPUSubtarget.h"
11 #include "SIInstrInfo.h"
12 #include "SIMachineFunctionInfo.h"
13 #include "SIRegisterInfo.h"
14 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15 
16 #include "llvm/CodeGen/LivePhysRegs.h"
17 #include "llvm/CodeGen/MachineFrameInfo.h"
18 #include "llvm/CodeGen/MachineFunction.h"
19 #include "llvm/CodeGen/MachineInstrBuilder.h"
20 #include "llvm/CodeGen/RegisterScavenging.h"
21 
22 using namespace llvm;
23 
24 
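// Return the 4-aligned SGPR quads (s[0:3], s[4:7], ...) that fit within this
// function's SGPR budget.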
25 static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
26                                          const MachineFunction &MF) {
27   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
28                       ST.getMaxNumSGPRs(MF) / 4);
29 }
30 
31 static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
32                                        const MachineFunction &MF) {
33   return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
34                       ST.getMaxNumSGPRs(MF));
35 }
36 
37 void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
38                                           MachineFunction &MF,
39                                           MachineBasicBlock &MBB) const {
40   const SIInstrInfo *TII = ST.getInstrInfo();
41   const SIRegisterInfo* TRI = &TII->getRegisterInfo();
42   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
43 
  // We don't need this if we only have spills since there is no user-facing
  // scratch.
46 
47   // TODO: If we know we don't have flat instructions earlier, we can omit
48   // this from the input registers.
49   //
50   // TODO: We only need to know if we access scratch space through a flat
51   // pointer. Because we only detect if flat instructions are used at all,
52   // this will be used more often than necessary on VI.
53 
54   // Debug location must be unknown since the first debug location is used to
55   // determine the end of the prologue.
56   DebugLoc DL;
57   MachineBasicBlock::iterator I = MBB.begin();
58 
59   unsigned FlatScratchInitReg
60     = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
61 
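  // Mark the preloaded flat scratch init register as live into the function
  // and the entry block; its value is consumed by the setup code below.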
62   MachineRegisterInfo &MRI = MF.getRegInfo();
63   MRI.addLiveIn(FlatScratchInitReg);
64   MBB.addLiveIn(FlatScratchInitReg);
65 
66   unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
67   unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
68 
69   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
70 
71   // Do a 64-bit pointer add.
72   if (ST.flatScratchIsPointer()) {
73     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
74       .addReg(FlatScrInitLo)
75       .addReg(ScratchWaveOffsetReg);
76     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
77       .addReg(FlatScrInitHi)
78       .addImm(0);
79 
80     return;
81   }
82 
83   // Copy the size in bytes.
84   BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
85     .addReg(FlatScrInitHi, RegState::Kill);
86 
87   // Add wave offset in bytes to private base offset.
88   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
89   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
90     .addReg(FlatScrInitLo)
91     .addReg(ScratchWaveOffsetReg);
92 
93   // Convert offset to 256-byte units.
94   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
95     .addReg(FlatScrInitLo, RegState::Kill)
96     .addImm(8);
97 }
98 
99 unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
100   const GCNSubtarget &ST,
101   const SIInstrInfo *TII,
102   const SIRegisterInfo *TRI,
103   SIMachineFunctionInfo *MFI,
104   MachineFunction &MF) const {
105   MachineRegisterInfo &MRI = MF.getRegInfo();
106 
107   // We need to insert initialization of the scratch resource descriptor.
108   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
109   if (ScratchRsrcReg == AMDGPU::NoRegister ||
110       !MRI.isPhysRegUsed(ScratchRsrcReg))
111     return AMDGPU::NoRegister;
112 
113   if (ST.hasSGPRInitBug() ||
114       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
115     return ScratchRsrcReg;
116 
117   // We reserved the last registers for this. Shift it down to the end of those
118   // which were actually used.
119   //
120   // FIXME: It might be safer to use a pseudoregister before replacement.
121 
122   // FIXME: We should be able to eliminate unused input registers. We only
123   // cannot do this for the resources required for scratch access. For now we
124   // skip over user SGPRs and may leave unused holes.
125 
126   // We find the resource first because it has an alignment requirement.
127 
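  // Round the preloaded SGPR count up to a multiple of 4 so that the candidate
  // SGPR128 tuples start past all of the preloaded registers.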
128   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
129   ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
131 
132   // Skip the last N reserved elements because they should have already been
133   // reserved for VCC etc.
134   for (MCPhysReg Reg : AllSGPR128s) {
135     // Pick the first unallocated one. Make sure we don't clobber the other
136     // reserved input we needed.
137     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
138       MRI.replaceRegWith(ScratchRsrcReg, Reg);
139       MFI->setScratchRSrcReg(Reg);
140       return Reg;
141     }
142   }
143 
144   return ScratchRsrcReg;
145 }
146 
147 // Shift down registers reserved for the scratch wave offset and stack pointer
148 // SGPRs.
149 std::pair<unsigned, unsigned>
150 SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
151   const GCNSubtarget &ST,
152   const SIInstrInfo *TII,
153   const SIRegisterInfo *TRI,
154   SIMachineFunctionInfo *MFI,
155   MachineFunction &MF) const {
156   MachineRegisterInfo &MRI = MF.getRegInfo();
157   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
158 
159   // No replacement necessary.
160   if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
161       !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) {
162     assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG);
163     return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister);
164   }
165 
166   unsigned SPReg = MFI->getStackPtrOffsetReg();
167   if (ST.hasSGPRInitBug())
168     return std::make_pair(ScratchWaveOffsetReg, SPReg);
169 
170   unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
171 
172   ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
173   if (NumPreloaded > AllSGPRs.size())
174     return std::make_pair(ScratchWaveOffsetReg, SPReg);
175 
176   AllSGPRs = AllSGPRs.slice(NumPreloaded);
177 
  // We need to drop registers from the end of the list that we cannot use
  // for the scratch wave offset.
  // + 2 because s102 and s103 do not exist on VI.
  // + 2 for vcc
  // + 2 for xnack_mask
  // + 2 for flat_scratch
  // + 4 for the registers reserved for the scratch resource register
  // + 1 for the register reserved for the scratch wave offset.  (By excluding
  //     this register from the candidate list, the value simply stays in this
  //     register when it is being used for the scratch wave offset and no
  //     other free SGPR is available.)
  // + 1 if the stack pointer is used.
  // ----
  //  13 (+1)
192   unsigned ReservedRegCount = 13;
193 
194   if (AllSGPRs.size() < ReservedRegCount)
195     return std::make_pair(ScratchWaveOffsetReg, SPReg);
196 
197   bool HandledScratchWaveOffsetReg =
198     ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
199 
200   for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
    // scratch descriptor, since we haven't added its uses yet.
203     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
204       if (!HandledScratchWaveOffsetReg) {
205         HandledScratchWaveOffsetReg = true;
206 
207         MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
208         MFI->setScratchWaveOffsetReg(Reg);
209         ScratchWaveOffsetReg = Reg;
210         break;
211       }
212     }
213   }
214 
215   return std::make_pair(ScratchWaveOffsetReg, SPReg);
216 }
217 
218 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
219                                                 MachineBasicBlock &MBB) const {
220   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
221 
222   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
223 
224   // If we only have SGPR spills, we won't actually be using scratch memory
225   // since these spill to VGPRs.
226   //
227   // FIXME: We should be cleaning up these unused SGPR spill frame indices
228   // somewhere.
229 
230   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
231   const SIInstrInfo *TII = ST.getInstrInfo();
232   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
233   MachineRegisterInfo &MRI = MF.getRegInfo();
234   const Function &F = MF.getFunction();
235 
  // We need to replace the private segment buffer and wave offset registers
  // even if there are no stack objects. There could be stores to undef or to
  // a constant address without an associated object.
239 
240   // FIXME: We still have implicit uses on SGPR spill instructions in case they
241   // need to spill to vector memory. It's likely that will not happen, but at
242   // this point it appears we need the setup. This part of the prolog should be
243   // emitted after frame indices are eliminated.
244 
245   if (MFI->hasFlatScratchInit())
246     emitFlatScratchInit(ST, MF, MBB);
247 
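  // If a dedicated stack pointer register was assigned, point it past the
  // entry function's fixed-size frame so that callees allocate above it.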
248   unsigned SPReg = MFI->getStackPtrOffsetReg();
249   if (SPReg != AMDGPU::SP_REG) {
250     assert(MRI.isReserved(SPReg) && "SPReg used but not reserved");
251 
252     DebugLoc DL;
253     const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
254     int64_t StackSize = FrameInfo.getStackSize();
255 
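    // MachineFrameInfo sizes are per-lane; scratch is interleaved across the
    // wave, so the SP offset is scaled by the wavefront size.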
256     if (StackSize == 0) {
257       BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
258         .addReg(MFI->getScratchWaveOffsetReg());
259     } else {
260       BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
261         .addReg(MFI->getScratchWaveOffsetReg())
262         .addImm(StackSize * ST.getWavefrontSize());
263     }
264   }
265 
266   unsigned ScratchRsrcReg
267     = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
268 
269   unsigned ScratchWaveOffsetReg;
270   std::tie(ScratchWaveOffsetReg, SPReg)
271     = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
272 
273   // It's possible to have uses of only ScratchWaveOffsetReg without
274   // ScratchRsrcReg if it's only used for the initialization of flat_scratch,
275   // but the inverse is not true.
276   if (ScratchWaveOffsetReg == AMDGPU::NoRegister) {
277     assert(ScratchRsrcReg == AMDGPU::NoRegister);
278     return;
279   }
280 
281   // We need to insert initialization of the scratch resource descriptor.
282   unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
283     AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
284 
285   unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
286   if (ST.isAmdHsaOrMesa(F)) {
287     PreloadedPrivateBufferReg = MFI->getPreloadedReg(
288       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
289   }
290 
291   bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
292   bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
293                          MRI.isPhysRegUsed(ScratchRsrcReg);
294 
295   // We added live-ins during argument lowering, but since they were not used
296   // they were deleted. We're adding the uses now, so add them back.
297   if (OffsetRegUsed) {
298     assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
299            "scratch wave offset input is required");
300     MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
301     MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
302   }
303 
304   if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
305     assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
306     MRI.addLiveIn(PreloadedPrivateBufferReg);
307     MBB.addLiveIn(PreloadedPrivateBufferReg);
308   }
309 
  // Make the selected registers live throughout the function.
311   for (MachineBasicBlock &OtherBB : MF) {
312     if (&OtherBB == &MBB)
313       continue;
314 
315     if (OffsetRegUsed)
316       OtherBB.addLiveIn(ScratchWaveOffsetReg);
317 
318     if (ResourceRegUsed)
319       OtherBB.addLiveIn(ScratchRsrcReg);
320   }
321 
322   DebugLoc DL;
323   MachineBasicBlock::iterator I = MBB.begin();
324 
325   // If we reserved the original input registers, we don't need to copy to the
326   // reserved registers.
327 
328   bool CopyBuffer = ResourceRegUsed &&
329     PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
330     ST.isAmdHsaOrMesa(F) &&
331     ScratchRsrcReg != PreloadedPrivateBufferReg;
332 
  // This needs to be careful of the copying order to avoid overwriting one of
  // the input registers before it has been copied to its final destination.
  // Usually the offset should be copied first.
336   bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
337                                               ScratchWaveOffsetReg);
338   if (CopyBuffer && CopyBufferFirst) {
339     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
340       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
341   }
342 
343   if (OffsetRegUsed &&
344       PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
345     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
346       .addReg(PreloadedScratchWaveOffsetReg,
347               MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill);
348   }
349 
350   if (CopyBuffer && !CopyBufferFirst) {
351     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
352       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
353   }
354 
355   if (ResourceRegUsed)
356     emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
357         PreloadedPrivateBufferReg, ScratchRsrcReg);
358 }
359 
360 // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
361 void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
362       MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
363       MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
364       unsigned ScratchRsrcReg) const {
365 
366   const SIInstrInfo *TII = ST.getInstrInfo();
367   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
368   const Function &Fn = MF.getFunction();
369   DebugLoc DL;
370 
371   if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
374     unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
375     unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
376     unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
377 
378     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
379 
380     if (MFI->getGITPtrHigh() != 0xffffffff) {
381       BuildMI(MBB, I, DL, SMovB32, RsrcHi)
382         .addImm(MFI->getGITPtrHigh())
383         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
384     } else {
385       const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
386       BuildMI(MBB, I, DL, GetPC64, Rsrc01);
387     }
388     auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
389     if (ST.hasMergedShaders()) {
390       switch (MF.getFunction().getCallingConv()) {
391         case CallingConv::AMDGPU_HS:
392         case CallingConv::AMDGPU_GS:
393           // Low GIT address is passed in s8 rather than s0 for an LS+HS or
394           // ES+GS merged shader on gfx9+.
395           GitPtrLo = AMDGPU::SGPR8;
396           break;
397         default:
398           break;
399       }
400     }
401     MF.getRegInfo().addLiveIn(GitPtrLo);
402     MF.front().addLiveIn(GitPtrLo);
403     BuildMI(MBB, I, DL, SMovB32, RsrcLo)
404       .addReg(GitPtrLo)
405       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
406 
    // Now that we have the GIT pointer, get the scratch descriptor from the
    // entry at offset 0 (or offset 16 for a compute shader).
409     PointerType *PtrTy =
410       PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
411                        AMDGPUAS::CONSTANT_ADDRESS);
412     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
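    // The scratch resource descriptor is a 128-bit buffer descriptor, so load
    // all four dwords from the GIT entry in one invariant load.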
413     const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
414     auto MMO = MF.getMachineMemOperand(PtrInfo,
415                                        MachineMemOperand::MOLoad |
416                                        MachineMemOperand::MOInvariant |
417                                        MachineMemOperand::MODereferenceable,
418                                        16, 4);
419     unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
420     const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
421     unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
422     BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
423       .addReg(Rsrc01)
424       .addImm(EncodedOffset) // offset
425       .addImm(0) // glc
426       .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
427       .addMemOperand(MMO);
428     return;
429   }
430   if (ST.isMesaGfxShader(Fn)
431       || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
432     assert(!ST.isAmdHsaOrMesa(Fn));
433     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
434 
435     unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
436     unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
437 
    // Use relocations to get the pointer, and set up the other bits manually.
439     uint64_t Rsrc23 = TII->getScratchRsrcWords23();
440 
441     if (MFI->hasImplicitBufferPtr()) {
442       unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
443 
444       if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
445         const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
446 
447         BuildMI(MBB, I, DL, Mov64, Rsrc01)
448           .addReg(MFI->getImplicitBufferPtrUserSGPR())
449           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
450       } else {
451         const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
452 
453         PointerType *PtrTy =
454           PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
455                            AMDGPUAS::CONSTANT_ADDRESS);
456         MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
457         auto MMO = MF.getMachineMemOperand(PtrInfo,
458                                            MachineMemOperand::MOLoad |
459                                            MachineMemOperand::MOInvariant |
460                                            MachineMemOperand::MODereferenceable,
461                                            8, 4);
462         BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
463           .addReg(MFI->getImplicitBufferPtrUserSGPR())
464           .addImm(0) // offset
465           .addImm(0) // glc
466           .addMemOperand(MMO)
467           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
468       }
469     } else {
470       unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
471       unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
472 
473       BuildMI(MBB, I, DL, SMovB32, Rsrc0)
474         .addExternalSymbol("SCRATCH_RSRC_DWORD0")
475         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
476 
477       BuildMI(MBB, I, DL, SMovB32, Rsrc1)
478         .addExternalSymbol("SCRATCH_RSRC_DWORD1")
479         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
480 
481     }
482 
483     BuildMI(MBB, I, DL, SMovB32, Rsrc2)
484       .addImm(Rsrc23 & 0xffffffff)
485       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
486 
487     BuildMI(MBB, I, DL, SMovB32, Rsrc3)
488       .addImm(Rsrc23 >> 32)
489       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
490   }
491 }
492 
493 // Find a scratch register that we can use at the start of the prologue to
494 // re-align the stack pointer.  We avoid using callee-save registers since they
495 // may appear to be free when this is called from canUseAsPrologue (during
496 // shrink wrapping), but then no longer be free when this is called from
497 // emitPrologue.
498 //
499 // FIXME: This is a bit conservative, since in the above case we could use one
500 // of the callee-save registers as a scratch temp to re-align the stack pointer,
501 // but we would then have to make sure that we were in fact saving at least one
502 // callee-save register in the prologue, which is additional complexity that
503 // doesn't seem worth the benefit.
504 static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
505   MachineFunction *MF = MBB.getParent();
506 
507   const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
508   const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
509   LivePhysRegs LiveRegs(TRI);
510   LiveRegs.addLiveIns(MBB);
511 
  // Mark callee-saved registers as used so we will not choose them.
513   const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
514   for (unsigned i = 0; CSRegs[i]; ++i)
515     LiveRegs.addReg(CSRegs[i]);
516 
517   MachineRegisterInfo &MRI = MF->getRegInfo();
518 
519   for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
520     if (LiveRegs.available(MRI, Reg))
521       return Reg;
522   }
523 
524   return AMDGPU::NoRegister;
525 }
526 
527 void SIFrameLowering::emitPrologue(MachineFunction &MF,
528                                    MachineBasicBlock &MBB) const {
529   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
530   if (FuncInfo->isEntryFunction()) {
531     emitEntryFunctionPrologue(MF, MBB);
532     return;
533   }
534 
535   const MachineFrameInfo &MFI = MF.getFrameInfo();
536   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
537   const SIInstrInfo *TII = ST.getInstrInfo();
538   const SIRegisterInfo &TRI = TII->getRegisterInfo();
539 
540   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
541   unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
542 
543   MachineBasicBlock::iterator MBBI = MBB.begin();
544   DebugLoc DL;
545 
546   // XXX - Is this the right predicate?
547 
548   bool NeedFP = hasFP(MF);
549   uint32_t NumBytes = MFI.getStackSize();
550   uint32_t RoundedSize = NumBytes;
551   const bool NeedsRealignment = TRI.needsStackRealignment(MF);
552 
553   if (NeedsRealignment) {
554     assert(NeedFP);
555     const unsigned Alignment = MFI.getMaxAlignment();
556 
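    // Over-allocate so the frame still fits after rounding the base up to the
    // requested alignment.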
557     RoundedSize += Alignment;
558 
559     unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
560     assert(ScratchSPReg != AMDGPU::NoRegister);
561 
    // Realign into the frame pointer register:
    //   s_add_u32 tmp, sp, (Alignment - 1) * WavefrontSize
    //   s_and_b32 fp,  tmp, -(Alignment * WavefrontSize)
564     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
565       .addReg(StackPtrReg)
566       .addImm((Alignment - 1) * ST.getWavefrontSize())
567       .setMIFlag(MachineInstr::FrameSetup);
568     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
569       .addReg(ScratchSPReg, RegState::Kill)
570       .addImm(-Alignment * ST.getWavefrontSize())
571       .setMIFlag(MachineInstr::FrameSetup);
572     FuncInfo->setIsStackRealigned(true);
573   } else if (NeedFP) {
    // If we need a frame pointer, set it up here. It's whatever the value of
    // the stack pointer is at this point. Any variable-sized objects will be
    // allocated after this, so we can still use the frame pointer to reference
    // locals.
578     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
579       .addReg(StackPtrReg)
580       .setMIFlag(MachineInstr::FrameSetup);
581   }
582 
583   if (RoundedSize != 0 && hasSP(MF)) {
584     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
585       .addReg(StackPtrReg)
586       .addImm(RoundedSize * ST.getWavefrontSize())
587       .setMIFlag(MachineInstr::FrameSetup);
588   }
589 
590   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
591          : FuncInfo->getSGPRSpillVGPRs()) {
592     if (!Reg.FI.hasValue())
593       continue;
594     TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
595                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
596                              &TII->getRegisterInfo());
597   }
598 }
599 
600 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
601                                    MachineBasicBlock &MBB) const {
602   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
603   if (FuncInfo->isEntryFunction())
604     return;
605 
606   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
607   const SIInstrInfo *TII = ST.getInstrInfo();
608   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
609 
610   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
611          : FuncInfo->getSGPRSpillVGPRs()) {
612     if (!Reg.FI.hasValue())
613       continue;
614     TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
615                               Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
616                               &TII->getRegisterInfo());
617   }
618 
619   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
620   if (StackPtrReg == AMDGPU::NoRegister)
621     return;
622 
623   const MachineFrameInfo &MFI = MF.getFrameInfo();
624   uint32_t NumBytes = MFI.getStackSize();
625 
626   DebugLoc DL;
627 
  // FIXME: Clarify the distinction between not setting the SP and setting it.
  // For callee functions, it's really whether we need the SP to be accurate
  // or not.
630 
631   if (NumBytes != 0 && hasSP(MF)) {
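    // Undo the prologue's SP increment, including the extra padding reserved
    // for realignment if the stack was realigned.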
632     uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
633       NumBytes + MFI.getMaxAlignment() : NumBytes;
634 
635     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
636       .addReg(StackPtrReg)
637       .addImm(RoundedSize * ST.getWavefrontSize());
638   }
639 }
640 
641 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
642   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
643        I != E; ++I) {
644     if (!MFI.isDeadObjectIndex(I))
645       return false;
646   }
647 
648   return true;
649 }
650 
651 int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
652                                             unsigned &FrameReg) const {
653   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
654 
655   FrameReg = RI->getFrameRegister(MF);
656   return MF.getFrameInfo().getObjectOffset(FI);
657 }
658 
659 void SIFrameLowering::processFunctionBeforeFrameFinalized(
660   MachineFunction &MF,
661   RegScavenger *RS) const {
662   MachineFrameInfo &MFI = MF.getFrameInfo();
663 
664   if (!MFI.hasStackObjects())
665     return;
666 
667   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
668   const SIInstrInfo *TII = ST.getInstrInfo();
669   const SIRegisterInfo &TRI = TII->getRegisterInfo();
670   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
671   bool AllSGPRSpilledToVGPRs = false;
672 
673   if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
674     AllSGPRSpilledToVGPRs = true;
675 
676     // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
677     // are spilled to VGPRs, in which case we can eliminate the stack usage.
678     //
    // XXX - This operates under the assumption that only other SGPR spills are
    // users of the frame index. I'm not 100% sure this is correct. The
    // StackColoring pass has a comment saying a future improvement would be to
    // merge allocas with spill slots, but for now, according to
    // MachineFrameInfo, an isSpillSlot object can't alias any other object.
684     for (MachineBasicBlock &MBB : MF) {
685       MachineBasicBlock::iterator Next;
686       for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
687         MachineInstr &MI = *I;
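        // Remember the next instruction now; lowering the spill below may
        // erase MI.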
688         Next = std::next(I);
689 
690         if (TII->isSGPRSpill(MI)) {
691           int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
692           assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL);
693           if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
694             bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
695             (void)Spilled;
696             assert(Spilled && "failed to spill SGPR to VGPR when allocated");
697           } else
698             AllSGPRSpilledToVGPRs = false;
699         }
700       }
701     }
702 
703     FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
704   }
705 
706   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
707   // but currently hasNonSpillStackObjects is set only from source
708   // allocas. Stack temps produced from legalization are not counted currently.
709   if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
710       !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
711     assert(RS && "RegScavenger required if spilling");
712 
713     // We force this to be at offset 0 so no user object ever has 0 as an
714     // address, so we may use 0 as an invalid pointer value. This is because
715     // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
716     // is required to be address space 0, we are forced to accept this for
717     // now. Ideally we could have the stack in another address space with 0 as a
718     // valid pointer, and -1 as the null value.
719     //
720     // This will also waste additional space when user stack objects require > 4
721     // byte alignment.
722     //
723     // The main cost here is losing the offset for addressing modes. However
724     // this also ensures we shouldn't need a register for the offset when
725     // emergency scavenging.
726     int ScavengeFI = MFI.CreateFixedObject(
727       TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
728     RS->addScavengingFrameIndex(ScavengeFI);
729   }
730 }
731 
732 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
733                                            RegScavenger *RS) const {
734   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
735   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
736 
737   // The SP is specifically managed and we don't want extra spills of it.
738   SavedRegs.reset(MFI->getStackPtrOffsetReg());
739 }
740 
741 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
742   MachineFunction &MF,
743   MachineBasicBlock &MBB,
744   MachineBasicBlock::iterator I) const {
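  // Operand 0 of the call frame pseudos holds the adjustment size; the destroy
  // form also carries the callee-popped byte count in operand 1.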
745   int64_t Amount = I->getOperand(0).getImm();
746   if (Amount == 0)
747     return MBB.erase(I);
748 
749   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
750   const SIInstrInfo *TII = ST.getInstrInfo();
751   const DebugLoc &DL = I->getDebugLoc();
752   unsigned Opc = I->getOpcode();
753   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
754   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
755 
756   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
757   if (!TFI->hasReservedCallFrame(MF)) {
758     unsigned Align = getStackAlignment();
759 
760     Amount = alignTo(Amount, Align);
761     assert(isUInt<32>(Amount) && "exceeded stack address space size");
762     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
763     unsigned SPReg = MFI->getStackPtrOffsetReg();
764 
765     unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
766     BuildMI(MBB, I, DL, TII->get(Op), SPReg)
767       .addReg(SPReg)
768       .addImm(Amount * ST.getWavefrontSize());
769   } else if (CalleePopAmount != 0) {
770     llvm_unreachable("is this used?");
771   }
772 
773   return MBB.erase(I);
774 }
775 
776 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
777   // All stack operations are relative to the frame offset SGPR.
  // TODO: We still want to be able to eliminate the FP in some cases.
779   const MachineFrameInfo &MFI = MF.getFrameInfo();
780 
781   // XXX - Is this only called after frame is finalized? Should be able to check
782   // frame size.
783   return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI);
784 }
785 
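// A separate stack pointer register is only needed when the frame cannot be
// addressed at purely fixed offsets: when there are calls, variable-sized
// objects, or stack realignment.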
786 bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
787   const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
788   // All stack operations are relative to the frame offset SGPR.
789   const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasCalls() || MFI.hasVarSizedObjects() ||
         TRI->needsStackRealignment(MF);
791 }
792