1 //===----------------------- SIFrameLowering.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 
9 #include "SIFrameLowering.h"
10 #include "AMDGPUSubtarget.h"
11 #include "SIInstrInfo.h"
12 #include "SIMachineFunctionInfo.h"
13 #include "SIRegisterInfo.h"
14 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15 
16 #include "llvm/CodeGen/LivePhysRegs.h"
17 #include "llvm/CodeGen/MachineFrameInfo.h"
18 #include "llvm/CodeGen/MachineFunction.h"
19 #include "llvm/CodeGen/MachineInstrBuilder.h"
20 #include "llvm/CodeGen/RegisterScavenging.h"
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "frame-info"
27 static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
28                                          const MachineFunction &MF) {
29   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
30                       ST.getMaxNumSGPRs(MF) / 4);
31 }
32 
33 static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
34                                        const MachineFunction &MF) {
35   return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
36 }
37 
38 // Find a scratch register that we can use at the start of the prologue to
39 // re-align the stack pointer. We avoid using callee-save registers since they
40 // may appear to be free when this is called from canUseAsPrologue (during
41 // shrink wrapping), but then no longer be free when this is called from
42 // emitPrologue.
43 //
44 // FIXME: This is a bit conservative, since in the above case we could use one
45 // of the callee-save registers as a scratch temp to re-align the stack pointer,
46 // but we would then have to make sure that we were in fact saving at least one
47 // callee-save register in the prologue, which is additional complexity that
48 // doesn't seem worth the benefit.
49 static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
50                                                    LivePhysRegs &LiveRegs,
51                                                    const TargetRegisterClass &RC,
52                                                    bool Unused = false) {
53   // Mark callee saved registers as used so we will not choose them.
54   const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
55   for (unsigned i = 0; CSRegs[i]; ++i)
56     LiveRegs.addReg(CSRegs[i]);
57 
58   if (Unused) {
59     // We are looking for a register that can be used throughout the entire
60     // function, so any use is unacceptable.
61     for (MCRegister Reg : RC) {
62       if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
63         return Reg;
64     }
65   } else {
66     for (MCRegister Reg : RC) {
67       if (LiveRegs.available(MRI, Reg))
68         return Reg;
69     }
70   }
71 
  // The unused-register mode is used in contexts where failure is an option
  // and the caller has an alternative plan. In other contexts, this must
  // succeed.
75   if (!Unused)
76     report_fatal_error("failed to find free scratch register");
77 
78   return MCRegister();
79 }
80 
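// Find an SGPR that has no uses anywhere in the function and is not callee
// saved, so it can hold a value across the entire function body without a
// save/restore.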
81 static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
82   LivePhysRegs LiveRegs;
83   LiveRegs.init(*MRI.getTargetRegisterInfo());
84   return findScratchNonCalleeSaveRegister(
85     MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
86 }
87 
// We need to specially emit stack operations here because a different frame
// register is used than the one getFrameRegister would return for the rest of
// the function.
91 static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
92                              MachineBasicBlock::iterator I,
93                              const SIInstrInfo *TII, Register SpillReg,
94                              Register ScratchRsrcReg, Register SPReg, int FI) {
95   MachineFunction *MF = MBB.getParent();
96   MachineFrameInfo &MFI = MF->getFrameInfo();
97 
98   int64_t Offset = MFI.getObjectOffset(FI);
99 
100   MachineMemOperand *MMO = MF->getMachineMemOperand(
101       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
102       MFI.getObjectAlign(FI));
103 
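  // MUBUF instructions only have a 12-bit unsigned immediate offset field;
  // anything larger must be materialized in a scratch VGPR and addressed with
  // the register-offset (OFFEN) form below.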
104   if (isUInt<12>(Offset)) {
105     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
106       .addReg(SpillReg, RegState::Kill)
107       .addReg(ScratchRsrcReg)
108       .addReg(SPReg)
109       .addImm(Offset)
110       .addImm(0) // glc
111       .addImm(0) // slc
112       .addImm(0) // tfe
113       .addImm(0) // dlc
114       .addImm(0) // swz
115       .addMemOperand(MMO);
116     return;
117   }
118 
119   MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
120     MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
121 
122   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
123     .addImm(Offset);
124 
125   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
126     .addReg(SpillReg, RegState::Kill)
127     .addReg(OffsetReg, RegState::Kill)
128     .addReg(ScratchRsrcReg)
129     .addReg(SPReg)
130     .addImm(0)
131     .addImm(0) // glc
132     .addImm(0) // slc
133     .addImm(0) // tfe
134     .addImm(0) // dlc
135     .addImm(0) // swz
136     .addMemOperand(MMO);
137 }
138 
139 static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
140                               MachineBasicBlock::iterator I,
141                               const SIInstrInfo *TII, Register SpillReg,
142                               Register ScratchRsrcReg, Register SPReg, int FI) {
143   MachineFunction *MF = MBB.getParent();
144   MachineFrameInfo &MFI = MF->getFrameInfo();
145   int64_t Offset = MFI.getObjectOffset(FI);
146 
147   MachineMemOperand *MMO = MF->getMachineMemOperand(
148       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
149       MFI.getObjectAlign(FI));
150 
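  // As in buildPrologSpill, offsets beyond the 12-bit MUBUF immediate are
  // materialized in a scratch VGPR and loaded with the OFFEN form.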
151   if (isUInt<12>(Offset)) {
152     BuildMI(MBB, I, DebugLoc(),
153             TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
154       .addReg(ScratchRsrcReg)
155       .addReg(SPReg)
156       .addImm(Offset)
157       .addImm(0) // glc
158       .addImm(0) // slc
159       .addImm(0) // tfe
160       .addImm(0) // dlc
161       .addImm(0) // swz
162       .addMemOperand(MMO);
163     return;
164   }
165 
166   MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
167     MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
168 
169   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
170     .addImm(Offset);
171 
172   BuildMI(MBB, I, DebugLoc(),
173           TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
174     .addReg(OffsetReg, RegState::Kill)
175     .addReg(ScratchRsrcReg)
176     .addReg(SPReg)
177     .addImm(0)
178     .addImm(0) // glc
179     .addImm(0) // slc
180     .addImm(0) // tfe
181     .addImm(0) // dlc
182     .addImm(0) // swz
183     .addMemOperand(MMO);
184 }
185 
186 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
187 void SIFrameLowering::emitEntryFunctionFlatScratchInit(
188     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
189     const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
190   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
191   const SIInstrInfo *TII = ST.getInstrInfo();
192   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
193   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
194 
  // We don't need this if we only have spills, since there is no user-facing
  // scratch.
197 
198   // TODO: If we know we don't have flat instructions earlier, we can omit
199   // this from the input registers.
200   //
201   // TODO: We only need to know if we access scratch space through a flat
202   // pointer. Because we only detect if flat instructions are used at all,
203   // this will be used more often than necessary on VI.
204 
205   Register FlatScratchInitReg =
206       MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
207 
208   MachineRegisterInfo &MRI = MF.getRegInfo();
209   MRI.addLiveIn(FlatScratchInitReg);
210   MBB.addLiveIn(FlatScratchInitReg);
211 
212   Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
213   Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
214 
215   // Do a 64-bit pointer add.
216   if (ST.flatScratchIsPointer()) {
217     if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
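      // GFX10 no longer exposes FLAT_SCR as a writable SGPR pair, so the
      // computed base is written into the FLAT_SCR_LO/HI hardware registers
      // with s_setreg_b32 below.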
218       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
219         .addReg(FlatScrInitLo)
220         .addReg(ScratchWaveOffsetReg);
221       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
222         .addReg(FlatScrInitHi)
223         .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
        .addReg(FlatScrInitLo)
        .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
        .addReg(FlatScrInitHi)
        .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
232       return;
233     }
234 
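    // On earlier targets FLAT_SCR is directly writable; the emitted sequence
    // is roughly:
    //   s_add_u32  flat_scratch_lo, <init_lo>, <wave_offset>
    //   s_addc_u32 flat_scratch_hi, <init_hi>, 0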
235     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
236       .addReg(FlatScrInitLo)
237       .addReg(ScratchWaveOffsetReg);
238     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
239       .addReg(FlatScrInitHi)
240       .addImm(0);
241 
242     return;
243   }
244 
245   assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);
246 
247   // Copy the size in bytes.
248   BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
249     .addReg(FlatScrInitHi, RegState::Kill);
250 
251   // Add wave offset in bytes to private base offset.
252   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
253   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
254     .addReg(FlatScrInitLo)
255     .addReg(ScratchWaveOffsetReg);
256 
257   // Convert offset to 256-byte units.
258   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
259     .addReg(FlatScrInitLo, RegState::Kill)
260     .addImm(8);
261 }
262 
263 // Shift down registers reserved for the scratch RSRC.
264 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
265     MachineFunction &MF) const {
266 
267   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
268   const SIInstrInfo *TII = ST.getInstrInfo();
269   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
270   MachineRegisterInfo &MRI = MF.getRegInfo();
271   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
272 
273   assert(MFI->isEntryFunction());
274 
275   Register ScratchRsrcReg = MFI->getScratchRSrcReg();
276 
277   if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
278     return Register();
279 
280   if (ST.hasSGPRInitBug() ||
281       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
282     return ScratchRsrcReg;
283 
284   // We reserved the last registers for this. Shift it down to the end of those
285   // which were actually used.
286   //
287   // FIXME: It might be safer to use a pseudoregister before replacement.
288 
289   // FIXME: We should be able to eliminate unused input registers. We only
290   // cannot do this for the resources required for scratch access. For now we
291   // skip over user SGPRs and may leave unused holes.
292 
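  // Each SGPR_128 covers four consecutive SGPR_32s, so round the preloaded
  // SGPR count up to a whole number of 4-register tuples before slicing.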
293   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
294   ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
296 
297   // Skip the last N reserved elements because they should have already been
298   // reserved for VCC etc.
299   Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
300   for (MCPhysReg Reg : AllSGPR128s) {
301     // Pick the first unallocated one. Make sure we don't clobber the other
302     // reserved input we needed. Also for PAL, make sure we don't clobber
303     // the GIT pointer passed in SGPR0 or SGPR8.
304     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
305         !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
306       MRI.replaceRegWith(ScratchRsrcReg, Reg);
307       MFI->setScratchRSrcReg(Reg);
308       return Reg;
309     }
310   }
311 
312   return ScratchRsrcReg;
313 }
314 
315 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
316                                                 MachineBasicBlock &MBB) const {
317   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
318 
319   // FIXME: If we only have SGPR spills, we won't actually be using scratch
320   // memory since these spill to VGPRs. We should be cleaning up these unused
321   // SGPR spill frame indices somewhere.
322 
323   // FIXME: We still have implicit uses on SGPR spill instructions in case they
324   // need to spill to vector memory. It's likely that will not happen, but at
325   // this point it appears we need the setup. This part of the prolog should be
326   // emitted after frame indices are eliminated.
327 
328   // FIXME: Remove all of the isPhysRegUsed checks
329 
330   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
331   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
332   const SIInstrInfo *TII = ST.getInstrInfo();
333   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
334   MachineRegisterInfo &MRI = MF.getRegInfo();
335   const Function &F = MF.getFunction();
336 
337   assert(MFI->isEntryFunction());
338 
339   Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
340       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
341   // FIXME: Hack to not crash in situations which emitted an error.
342   if (!PreloadedScratchWaveOffsetReg)
343     return;
344 
345   // We need to do the replacement of the private segment buffer register even
346   // if there are no stack objects. There could be stores to undef or a
347   // constant without an associated object.
348   //
349   // This will return `Register()` in cases where there are no actual
350   // uses of the SRSRC.
351   Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
352 
353   // Make the selected register live throughout the function.
354   if (ScratchRsrcReg) {
355     for (MachineBasicBlock &OtherBB : MF) {
356       if (&OtherBB != &MBB) {
357         OtherBB.addLiveIn(ScratchRsrcReg);
358       }
359     }
360   }
361 
362   // Now that we have fixed the reserved SRSRC we need to locate the
363   // (potentially) preloaded SRSRC.
364   Register PreloadedScratchRsrcReg;
365   if (ST.isAmdHsaOrMesa(F)) {
366     PreloadedScratchRsrcReg =
367         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
368     if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
369       // We added live-ins during argument lowering, but since they were not
370       // used they were deleted. We're adding the uses now, so add them back.
371       MRI.addLiveIn(PreloadedScratchRsrcReg);
372       MBB.addLiveIn(PreloadedScratchRsrcReg);
373     }
374   }
375 
376   // Debug location must be unknown since the first debug location is used to
377   // determine the end of the prologue.
378   DebugLoc DL;
379   MachineBasicBlock::iterator I = MBB.begin();
380 
  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
386   Register ScratchWaveOffsetReg;
387   if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
388     ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
389     unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
390     AllSGPRs = AllSGPRs.slice(
391         std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
392     Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
393     for (MCPhysReg Reg : AllSGPRs) {
394       if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
395           !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
396         ScratchWaveOffsetReg = Reg;
397         BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
398             .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
399         break;
400       }
401     }
402   } else {
403     ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
404   }
405   assert(ScratchWaveOffsetReg);
406 
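  // Frame sizes are tracked per lane, but scratch memory is swizzled per
  // wave, so wave-relative SP values are scaled by the wavefront size.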
407   if (MF.getFrameInfo().hasCalls()) {
408     Register SPReg = MFI->getStackPtrOffsetReg();
409     assert(SPReg != AMDGPU::SP_REG);
410     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
411         .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
412   }
413 
414   if (hasFP(MF)) {
415     Register FPReg = MFI->getFrameOffsetReg();
416     assert(FPReg != AMDGPU::FP_REG);
417     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
418   }
419 
420   if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
421     MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
422     MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
423   }
424 
425   if (MFI->hasFlatScratchInit()) {
426     emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
427   }
428 
429   if (ScratchRsrcReg) {
430     emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
431                                          PreloadedScratchRsrcReg,
432                                          ScratchRsrcReg, ScratchWaveOffsetReg);
433   }
434 }
435 
436 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
437 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
438     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
439     const DebugLoc &DL, Register PreloadedScratchRsrcReg,
440     Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
441 
442   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
443   const SIInstrInfo *TII = ST.getInstrInfo();
444   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
445   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
446   const Function &Fn = MF.getFunction();
447 
448   if (ST.isAmdPalOS()) {
449     // The pointer to the GIT is formed from the offset passed in and either
450     // the amdgpu-git-ptr-high function attribute or the top part of the PC
451     Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
452     Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
453     Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
454 
455     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
456 
457     if (MFI->getGITPtrHigh() != 0xffffffff) {
458       BuildMI(MBB, I, DL, SMovB32, RsrcHi)
459         .addImm(MFI->getGITPtrHigh())
460         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
461     } else {
462       const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
463       BuildMI(MBB, I, DL, GetPC64, Rsrc01);
464     }
465     Register GitPtrLo = MFI->getGITPtrLoReg(MF);
466     MF.getRegInfo().addLiveIn(GitPtrLo);
467     MBB.addLiveIn(GitPtrLo);
468     BuildMI(MBB, I, DL, SMovB32, RsrcLo)
469       .addReg(GitPtrLo)
470       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
471 
472     // We now have the GIT ptr - now get the scratch descriptor from the entry
473     // at offset 0 (or offset 16 for a compute shader).
474     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
475     const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
476     auto MMO = MF.getMachineMemOperand(PtrInfo,
477                                        MachineMemOperand::MOLoad |
478                                            MachineMemOperand::MOInvariant |
479                                            MachineMemOperand::MODereferenceable,
480                                        16, Align(4));
481     unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
482     const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
483     unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
484     BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
485       .addReg(Rsrc01)
486       .addImm(EncodedOffset) // offset
487       .addImm(0) // glc
488       .addImm(0) // dlc
489       .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
490       .addMemOperand(MMO);
491   } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
492     assert(!ST.isAmdHsaOrMesa(Fn));
493     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
494 
495     Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
496     Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
497 
498     // Use relocations to get the pointer, and setup the other bits manually.
499     uint64_t Rsrc23 = TII->getScratchRsrcWords23();
500 
501     if (MFI->hasImplicitBufferPtr()) {
502       Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
503 
504       if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
505         const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
506 
507         BuildMI(MBB, I, DL, Mov64, Rsrc01)
508           .addReg(MFI->getImplicitBufferPtrUserSGPR())
509           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
510       } else {
511         const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
512 
513         MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
514         auto MMO = MF.getMachineMemOperand(
515             PtrInfo,
516             MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
517                 MachineMemOperand::MODereferenceable,
518             8, Align(4));
519         BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
520           .addReg(MFI->getImplicitBufferPtrUserSGPR())
521           .addImm(0) // offset
522           .addImm(0) // glc
523           .addImm(0) // dlc
524           .addMemOperand(MMO)
525           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
526 
527         MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
528         MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
529       }
530     } else {
531       Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
532       Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
533 
534       BuildMI(MBB, I, DL, SMovB32, Rsrc0)
535         .addExternalSymbol("SCRATCH_RSRC_DWORD0")
536         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
537 
538       BuildMI(MBB, I, DL, SMovB32, Rsrc1)
539         .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }
543 
544     BuildMI(MBB, I, DL, SMovB32, Rsrc2)
545       .addImm(Rsrc23 & 0xffffffff)
546       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
547 
548     BuildMI(MBB, I, DL, SMovB32, Rsrc3)
549       .addImm(Rsrc23 >> 32)
550       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
551   } else if (ST.isAmdHsaOrMesa(Fn)) {
552     assert(PreloadedScratchRsrcReg);
553 
554     if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
555       BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
556           .addReg(PreloadedScratchRsrcReg, RegState::Kill);
557     }
558   }
559 
560   // Add the scratch wave offset into the scratch RSRC.
561   //
562   // We only want to update the first 48 bits, which is the base address
563   // pointer, without touching the adjacent 16 bits of flags. We know this add
564   // cannot carry-out from bit 47, otherwise the scratch allocation would be
565   // impossible to fit in the 48-bit global address space.
566   //
567   // TODO: Evaluate if it is better to just construct an SRD using the flat
568   // scratch init and some constants rather than update the one we are passed.
569   Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
570   Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
571 
572   // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
573   // the kernel body via inreg arguments.
574   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
575       .addReg(ScratchRsrcSub0)
576       .addReg(ScratchWaveOffsetReg)
577       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
578   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
579       .addReg(ScratchRsrcSub1)
580       .addImm(0)
581       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
582 }
583 
584 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
585   switch (ID) {
586   case TargetStackID::Default:
587   case TargetStackID::NoAlloc:
588   case TargetStackID::SGPRSpill:
589     return true;
590   case TargetStackID::SVEVector:
591     return false;
592   }
593   llvm_unreachable("Invalid TargetStackID::Value");
594 }
595 
// Activate all lanes and return the saved exec mask.
597 static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
598                                      MachineFunction &MF,
599                                      MachineBasicBlock &MBB,
600                                      MachineBasicBlock::iterator MBBI,
601                                      bool IsProlog) {
602   Register ScratchExecCopy;
603   MachineRegisterInfo &MRI = MF.getRegInfo();
604   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
605   const SIInstrInfo *TII = ST.getInstrInfo();
606   const SIRegisterInfo &TRI = TII->getRegisterInfo();
607   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
608   DebugLoc DL;
609 
610   if (LiveRegs.empty()) {
611     if (IsProlog) {
612       LiveRegs.init(TRI);
613       LiveRegs.addLiveIns(MBB);
614       if (FuncInfo->SGPRForFPSaveRestoreCopy)
615         LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
616     } else {
617       // In epilog.
618       LiveRegs.init(*ST.getRegisterInfo());
619       LiveRegs.addLiveOuts(MBB);
620       LiveRegs.stepBackward(*MBBI);
621     }
622   }
623 
624   ScratchExecCopy = findScratchNonCalleeSaveRegister(
625       MRI, LiveRegs, *TRI.getWaveMaskRegClass());
626 
627   if (!IsProlog)
628     LiveRegs.removeReg(ScratchExecCopy);
629 
630   const unsigned OrSaveExec =
631       ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
632   BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);
633 
634   return ScratchExecCopy;
635 }
636 
637 void SIFrameLowering::emitPrologue(MachineFunction &MF,
638                                    MachineBasicBlock &MBB) const {
639   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
640   if (FuncInfo->isEntryFunction()) {
641     emitEntryFunctionPrologue(MF, MBB);
642     return;
643   }
644 
645   const MachineFrameInfo &MFI = MF.getFrameInfo();
646   MachineRegisterInfo &MRI = MF.getRegInfo();
647   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
648   const SIInstrInfo *TII = ST.getInstrInfo();
649   const SIRegisterInfo &TRI = TII->getRegisterInfo();
650 
651   Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
652   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
653   LivePhysRegs LiveRegs;
654 
655   MachineBasicBlock::iterator MBBI = MBB.begin();
656   DebugLoc DL;
657 
658   bool HasFP = false;
659   uint32_t NumBytes = MFI.getStackSize();
660   uint32_t RoundedSize = NumBytes;
661   // To avoid clobbering VGPRs in lanes that weren't active on function entry,
662   // turn on all lanes before doing the spill to memory.
663   Register ScratchExecCopy;
664 
665   bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
666   bool SpillFPToMemory = false;
667   // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
668   // Otherwise we are spilling the FP to memory.
669   if (HasFPSaveIndex) {
670     SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
671                       TargetStackID::SGPRSpill;
672   }
673 
674   // Emit the copy if we need an FP, and are using a free SGPR to save it.
675   if (FuncInfo->SGPRForFPSaveRestoreCopy) {
676     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
677       .addReg(FramePtrReg)
678       .setMIFlag(MachineInstr::FrameSetup);
679     // Make the register live throughout the function.
680     for (MachineBasicBlock &MBB : MF)
681       MBB.addLiveIn(FuncInfo->SGPRForFPSaveRestoreCopy);
682   }
683 
684   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
685          : FuncInfo->getSGPRSpillVGPRs()) {
686     if (!Reg.FI.hasValue())
687       continue;
688 
689     if (!ScratchExecCopy)
690       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
691 
692     buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
693                      FuncInfo->getScratchRSrcReg(),
694                      StackPtrReg,
695                      Reg.FI.getValue());
696   }
697 
698   if (HasFPSaveIndex && SpillFPToMemory) {
699     assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));
700 
701     if (!ScratchExecCopy)
702       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
703 
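    // Buffer stores can only take VGPR data, so the SGPR frame pointer is
    // copied through a temporary VGPR before being spilled to memory.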
704     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
705         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
706 
707     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
708         .addReg(FramePtrReg);
709 
710     buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
711                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
712                      FuncInfo->FramePointerSaveIndex.getValue());
713   }
714 
715   if (ScratchExecCopy) {
716     // FIXME: Split block and make terminator.
717     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
718     MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
719     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
720         .addReg(ScratchExecCopy, RegState::Kill);
721     LiveRegs.addReg(ScratchExecCopy);
722   }
723 
  // If the FP save slot is an SGPR spill, save the FP into a lane of the
  // reserved VGPR.
725   if (HasFPSaveIndex && !SpillFPToMemory) {
726     const int FI = FuncInfo->FramePointerSaveIndex.getValue();
727     assert(!MFI.isDeadObjectIndex(FI));
728 
729     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
730     ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
731         FuncInfo->getSGPRToVGPRSpills(FI);
732     assert(Spill.size() == 1);
733 
734     // Save FP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
736     BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
737             Spill[0].VGPR)
738         .addReg(FramePtrReg)
739         .addImm(Spill[0].Lane)
740         .addReg(Spill[0].VGPR, RegState::Undef);
741   }
742 
743   if (TRI.needsStackRealignment(MF)) {
744     HasFP = true;
745     const unsigned Alignment = MFI.getMaxAlign().value();
746 
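    // Over-allocate by one full alignment so that NumBytes of stack remain
    // available above the realigned frame pointer in the worst case.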
747     RoundedSize += Alignment;
748     if (LiveRegs.empty()) {
749       LiveRegs.init(TRI);
750       LiveRegs.addLiveIns(MBB);
751       LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
752     }
753 
754     Register ScratchSPReg = findScratchNonCalleeSaveRegister(
755         MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
756     assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
757 
758     // s_add_u32 tmp_reg, s32, NumBytes
759     // s_and_b32 s32, tmp_reg, 0b111...0000
760     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
761         .addReg(StackPtrReg)
762         .addImm((Alignment - 1) * ST.getWavefrontSize())
763         .setMIFlag(MachineInstr::FrameSetup);
764     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
765         .addReg(ScratchSPReg, RegState::Kill)
766         .addImm(-Alignment * ST.getWavefrontSize())
767         .setMIFlag(MachineInstr::FrameSetup);
768     FuncInfo->setIsStackRealigned(true);
769   } else if ((HasFP = hasFP(MF))) {
    // If we need a frame pointer, set it up here. It's whatever the value of
    // the stack pointer is at this point. Any variable size objects will be
    // allocated after this, so we can still use the frame pointer to reference
    // locals.
774     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
775         .addReg(StackPtrReg)
776         .setMIFlag(MachineInstr::FrameSetup);
777   }
778 
779   if (HasFP && RoundedSize != 0) {
780     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
781         .addReg(StackPtrReg)
782         .addImm(RoundedSize * ST.getWavefrontSize())
783         .setMIFlag(MachineInstr::FrameSetup);
784   }
785 
786   assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
787                      FuncInfo->FramePointerSaveIndex)) &&
788          "Needed to save FP but didn't save it anywhere");
789 
790   assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
791                     !FuncInfo->FramePointerSaveIndex)) &&
792          "Saved FP but didn't need it");
793 }
794 
795 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
796                                    MachineBasicBlock &MBB) const {
797   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
798   if (FuncInfo->isEntryFunction())
799     return;
800 
801   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
802   const SIInstrInfo *TII = ST.getInstrInfo();
803   MachineRegisterInfo &MRI = MF.getRegInfo();
804   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
805   LivePhysRegs LiveRegs;
806   DebugLoc DL;
807 
808   const MachineFrameInfo &MFI = MF.getFrameInfo();
809   uint32_t NumBytes = MFI.getStackSize();
810   uint32_t RoundedSize = FuncInfo->isStackRealigned()
811                              ? NumBytes + MFI.getMaxAlign().value()
812                              : NumBytes;
813   const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
814   const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
815 
816   bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
817   bool SpillFPToMemory = false;
818   if (HasFPSaveIndex) {
819     SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
820                       TargetStackID::SGPRSpill;
821   }
822 
823   if (RoundedSize != 0 && hasFP(MF)) {
824     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
825       .addReg(StackPtrReg)
826       .addImm(RoundedSize * ST.getWavefrontSize())
827       .setMIFlag(MachineInstr::FrameDestroy);
828   }
829 
830   if (FuncInfo->SGPRForFPSaveRestoreCopy) {
831     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
832         .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
834   }
835 
836   Register ScratchExecCopy;
837   if (HasFPSaveIndex) {
838     const int FI = FuncInfo->FramePointerSaveIndex.getValue();
839     assert(!MFI.isDeadObjectIndex(FI));
840     if (SpillFPToMemory) {
841       if (!ScratchExecCopy)
842         ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
843 
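      // The FP was spilled to memory through a VGPR; reload it into a
      // temporary VGPR and broadcast it back to the SGPR with
      // v_readfirstlane_b32.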
844       MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
845           MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
846       buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
847                         FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
848       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
849           .addReg(TempVGPR, RegState::Kill);
850     } else {
851       // Reload from VGPR spill.
852       assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
853       ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
854           FuncInfo->getSGPRToVGPRSpills(FI);
855       assert(Spill.size() == 1);
856       BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
857               FramePtrReg)
858           .addReg(Spill[0].VGPR)
859           .addImm(Spill[0].Lane);
860     }
861   }
862 
863   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
864        FuncInfo->getSGPRSpillVGPRs()) {
865     if (!Reg.FI.hasValue())
866       continue;
867 
868     if (!ScratchExecCopy)
869       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
870 
871     buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
872                       FuncInfo->getScratchRSrcReg(), StackPtrReg,
873                       Reg.FI.getValue());
874   }
875 
876   if (ScratchExecCopy) {
877     // FIXME: Split block and make terminator.
878     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
879     MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
880     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
881         .addReg(ScratchExecCopy, RegState::Kill);
882   }
883 }
884 
885 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
886 // memory. They should have been removed by now.
887 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
888   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
889        I != E; ++I) {
890     if (!MFI.isDeadObjectIndex(I))
891       return false;
892   }
893 
894   return true;
895 }
896 
897 #ifndef NDEBUG
898 static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
899                                  Optional<int> FramePointerSaveIndex) {
900   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
901        I != E; ++I) {
902     if (!MFI.isDeadObjectIndex(I) &&
903         MFI.getStackID(I) == TargetStackID::SGPRSpill &&
904         FramePointerSaveIndex && I != FramePointerSaveIndex) {
905       return false;
906     }
907   }
908 
909   return true;
910 }
911 #endif
912 
913 int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
914                                             Register &FrameReg) const {
915   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
916 
917   FrameReg = RI->getFrameRegister(MF);
918   return MF.getFrameInfo().getObjectOffset(FI);
919 }
920 
921 void SIFrameLowering::processFunctionBeforeFrameFinalized(
922   MachineFunction &MF,
923   RegScavenger *RS) const {
924   MachineFrameInfo &MFI = MF.getFrameInfo();
925 
926   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
927   const SIRegisterInfo *TRI = ST.getRegisterInfo();
928   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
929 
930   FuncInfo->removeDeadFrameIndices(MFI);
931   assert(allSGPRSpillsAreDead(MFI, None) &&
932          "SGPR spill should have been removed in SILowerSGPRSpills");
933 
934   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
935   // but currently hasNonSpillStackObjects is set only from source
936   // allocas. Stack temps produced from legalization are not counted currently.
937   if (!allStackObjectsAreDead(MFI)) {
938     assert(RS && "RegScavenger required if spilling");
939 
940     if (FuncInfo->isEntryFunction()) {
941       int ScavengeFI = MFI.CreateFixedObject(
942         TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
943       RS->addScavengingFrameIndex(ScavengeFI);
944     } else {
945       int ScavengeFI = MFI.CreateStackObject(
946         TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
947         TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
948         false);
949       RS->addScavengingFrameIndex(ScavengeFI);
950     }
951   }
952 }
953 
954 // Only report VGPRs to generic code.
955 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
956                                            BitVector &SavedVGPRs,
957                                            RegScavenger *RS) const {
958   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
959   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
960   if (MFI->isEntryFunction())
961     return;
962 
963   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
964   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
965   const SIRegisterInfo *TRI = ST.getRegisterInfo();
966 
967   // Ignore the SGPRs the default implementation found.
968   SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
969 
970   // hasFP only knows about stack objects that already exist. We're now
971   // determining the stack slots that will be created, so we have to predict
972   // them. Stack objects force FP usage with calls.
973   //
974   // Note a new VGPR CSR may be introduced if one is used for the spill, but we
975   // don't want to report it here.
976   //
977   // FIXME: Is this really hasReservedCallFrame?
978   const bool WillHaveFP =
979       FrameInfo.hasCalls() &&
980       (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
981 
982   // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
983   // so don't allow the default insertion to handle them.
984   for (auto SSpill : MFI->getSGPRSpillVGPRs())
985     SavedVGPRs.reset(SSpill.VGPR);
986 
987   const bool HasFP = WillHaveFP || hasFP(MF);
988   if (!HasFP)
989     return;
990 
991   // We need to save and restore the current FP.
992 
993   // 1: If there is already a VGPR with free lanes, use it. We
994   // may already have to pay the penalty for spilling a CSR VGPR.
995   if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
996     int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
997                                                     TargetStackID::SGPRSpill);
998 
999     if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
1000       llvm_unreachable("allocate SGPR spill should have worked");
1001 
1002     MFI->FramePointerSaveIndex = NewFI;
1003 
1004     LLVM_DEBUG(
1005       auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
1006       dbgs() << "Spilling FP to  " << printReg(Spill.VGPR, TRI)
1007              << ':' << Spill.Lane << '\n');
1008     return;
1009   }
1010 
1011   // 2: Next, try to save the FP in an unused SGPR.
1012   MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
1013 
1014   if (!MFI->SGPRForFPSaveRestoreCopy) {
1015     int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
1016                                                     TargetStackID::SGPRSpill);
1017 
    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP, so
      // we're forced to spill another VGPR to use for the spill.
      MFI->FramePointerSaveIndex = NewFI;

      LLVM_DEBUG(
        auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
        dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
               << ':' << Spill.Lane << '\n';);
    } else {
      // 4: If all else fails, spill the FP to memory.
      MFI->FramePointerSaveIndex =
          FrameInfo.CreateSpillStackObject(4, Align(4));
      LLVM_DEBUG(dbgs() << "FP requires fallback spill to memory\n");
    }
1032   } else {
1033     LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
1034                printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
1035   }
1036 }
1037 
1038 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1039                                                BitVector &SavedRegs,
1040                                                RegScavenger *RS) const {
1041   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1042   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1043   if (MFI->isEntryFunction())
1044     return;
1045 
1046   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1047   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1048 
1049   // The SP is specifically managed and we don't want extra spills of it.
1050   SavedRegs.reset(MFI->getStackPtrOffsetReg());
1051   SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
1052 }
1053 
1054 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1055     MachineFunction &MF, const TargetRegisterInfo *TRI,
1056     std::vector<CalleeSavedInfo> &CSI) const {
1057   if (CSI.empty())
1058     return true; // Early exit if no callee saved registers are modified!
1059 
1060   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1061   if (!FuncInfo->SGPRForFPSaveRestoreCopy)
1062     return false;
1063 
  for (auto &CS : CSI) {
    if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
      CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      break;
    }
  }
1071 
1072   return false;
1073 }
1074 
1075 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1076   MachineFunction &MF,
1077   MachineBasicBlock &MBB,
1078   MachineBasicBlock::iterator I) const {
1079   int64_t Amount = I->getOperand(0).getImm();
1080   if (Amount == 0)
1081     return MBB.erase(I);
1082 
1083   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1084   const SIInstrInfo *TII = ST.getInstrInfo();
1085   const DebugLoc &DL = I->getDebugLoc();
1086   unsigned Opc = I->getOpcode();
1087   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1088   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1089 
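  // Without a reserved call frame the SP must be bumped and restored around
  // each call site; as elsewhere, the per-lane amount is scaled by the
  // wavefront size to get a wave-relative offset.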
1090   if (!hasReservedCallFrame(MF)) {
1091     Amount = alignTo(Amount, getStackAlign());
1092     assert(isUInt<32>(Amount) && "exceeded stack address space size");
1093     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1094     Register SPReg = MFI->getStackPtrOffsetReg();
1095 
1096     unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
1097     BuildMI(MBB, I, DL, TII->get(Op), SPReg)
1098       .addReg(SPReg)
1099       .addImm(Amount * ST.getWavefrontSize());
1100   } else if (CalleePopAmount != 0) {
1101     llvm_unreachable("is this used?");
1102   }
1103 
1104   return MBB.erase(I);
1105 }
1106 
1107 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1108   const MachineFrameInfo &MFI = MF.getFrameInfo();
1109 
1110   // For entry functions we can use an immediate offset in most cases, so the
1111   // presence of calls doesn't imply we need a distinct frame pointer.
1112   if (MFI.hasCalls() &&
1113       !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1114     // All offsets are unsigned, so need to be addressed in the same direction
1115     // as stack growth.
1116 
1117     // FIXME: This function is pretty broken, since it can be called before the
1118     // frame layout is determined or CSR spills are inserted.
1119     return MFI.getStackSize() != 0;
1120   }
1121 
1122   return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
1123     MFI.hasStackMap() || MFI.hasPatchPoint() ||
1124     MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
1125     MF.getTarget().Options.DisableFramePointerElim(MF);
1126 }
1127