//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"


static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                         const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                                       const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  if (Unused) {
    // We are looking for a register that can be used throughout the entire
    // function, so any use is unacceptable.
    for (MCRegister Reg : RC) {
      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
        return Reg;
    }
  } else {
    for (MCRegister Reg : RC) {
      if (LiveRegs.available(MRI, Reg))
        return Reg;
    }
  }

  // If we require an unused register, failure is an option and the caller has
  // an alternative plan. In other contexts, this must succeed.
  if (!Unused)
    report_fatal_error("failed to find free scratch register");

  return MCRegister();
}

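// Find an SGPR from SReg_32_XM0_XEXEC that is not callee-saved and is entirely
// unused in the function, so it can hold a value (such as the saved FP) live
// across the whole body. Returns no register if every candidate is taken.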
static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
  LivePhysRegs LiveRegs;
  LiveRegs.init(*MRI.getTargetRegisterInfo());
  return findScratchNonCalleeSaveRegister(
    MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
}

// We need to specially emit stack operations here because a different frame
// register is used here than the one getFrameRegister would return for the
// rest of the function.
static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I,
                             const SIInstrInfo *TII, Register SpillReg,
                             Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();

  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
      MFI.getObjectAlign(FI));

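  // The MUBUF offset field is a 12-bit unsigned immediate; offsets that fit
  // are encoded directly, larger ones are materialized in a scratch VGPR and
  // used with the OFFEN form below.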
  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
      .addReg(SpillReg, RegState::Kill)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
    .addReg(SpillReg, RegState::Kill)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addImm(0) // swz
    .addMemOperand(MMO);
}

static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
                              const SIInstrInfo *TII, Register SpillReg,
                              Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
      MFI.getObjectAlign(FI));

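  // As in buildPrologSpill, use the 12-bit MUBUF immediate when the offset
  // fits and fall back to an OFFEN load through a scratch VGPR otherwise.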
  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(),
            TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(),
          TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addImm(0) // swz
    .addMemOperand(MMO);
}

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScratchInitReg =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
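      // On GFX10+ the flat scratch base is not an addressable SGPR pair, so
      // write the computed value into the FLAT_SCR_LO/HI hardware registers
      // via s_setreg instead.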
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitLo).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitHi).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
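  // NumPreloaded is rounded up to whole 4-SGPR tuples so the search below
  // starts at the first aligned SGPR128 candidate past the preloaded SGPRs.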
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
  // FIXME: Hack to not crash in situations which emitted an error.
  if (!PreloadedScratchWaveOffsetReg)
    return;

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found conflicts with the
  // scratch wave offset, which may be in a fixed SGPR or a free SGPR chosen
  // by SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to
  // a free SGPR.
  Register ScratchWaveOffsetReg;
  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg);

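  // If there are calls, initialize the stack pointer past this function's
  // frame. The stack size is a per-lane byte count, while the SP holds a
  // wave-relative byte offset into swizzled scratch, so scale by the
  // wavefront size.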
  if (MF.getFrameInfo().hasCalls()) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (MFI->hasFlatScratchInit()) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    if (MFI->getGITPtrHigh() != 0xffffffff) {
      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    } else {
      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
    }
    Register GitPtrLo = MFI->getGITPtrLoReg(MF);
    MF.getRegInfo().addLiveIn(GitPtrLo);
    MBB.addLiveIn(GitPtrLo);
    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
      .addReg(GitPtrLo)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // glc
      .addImm(0) // dlc
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();
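    // Rsrc23 holds the constant high dwords of the scratch buffer descriptor
    // (NUM_RECORDS and the descriptor flags); only the base pointer in dwords
    // 0-1 still needs to be filled in below.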

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // glc
          .addImm(0) // dlc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}

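// SGPR spill slots never reach memory: they are lowered to VGPR lanes, so the
// SGPRSpill stack ID is supported here. SVE stack IDs are AArch64-only.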
bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::SVEVector:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate all lanes, returns saved exec.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     bool IsProlog) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  DebugLoc DL;

  if (LiveRegs.empty()) {
    if (IsProlog) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      if (FuncInfo->SGPRForFPSaveRestoreCopy)
        LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
    } else {
      // In epilog.
      LiveRegs.init(*ST.getRegisterInfo());
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());

  if (!IsProlog)
    LiveRegs.removeReg(ScratchExecCopy);

  const unsigned OrSaveExec =
      ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
  BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);

  return ScratchExecCopy;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

  bool HasFP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;
  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
  // turn on all lanes before doing the spill to memory.
  Register ScratchExecCopy;

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
  // Otherwise we are spilling the FP to memory.
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  // Emit the copy if we need an FP, and are using a free SGPR to save it.
  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
            FuncInfo->SGPRForFPSaveRestoreCopy)
      .addReg(FramePtrReg)
      .setMIFlag(MachineInstr::FrameSetup);
    // Make the register live throughout the function.
    for (MachineBasicBlock &MBB : MF)
      MBB.addLiveIn(FuncInfo->SGPRForFPSaveRestoreCopy);
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                     FuncInfo->getScratchRSrcReg(),
                     StackPtrReg,
                     Reg.FI.getValue());
  }

  if (HasFPSaveIndex && SpillFPToMemory) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(FramePtrReg);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
                     FuncInfo->FramePointerSaveIndex.getValue());
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }

  // In this case, spill the FP to a lane of a reserved VGPR.
  if (HasFPSaveIndex && !SpillFPToMemory) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
        FuncInfo->getSGPRToVGPRSpills(FI);
    assert(Spill.size() == 1);

    // Save FP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            Spill[0].VGPR)
        .addReg(FramePtrReg)
        .addImm(Spill[0].Lane)
        .addReg(Spill[0].VGPR, RegState::Undef);
  }

  if (TRI.needsStackRealignment(MF)) {
    HasFP = true;
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
    }

    Register ScratchSPReg = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);

    // s_add_u32 tmp_reg, s32, NumBytes
    // s_and_b32 s32, tmp_reg, 0b111...0000
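    // SP/FP hold wave-relative byte offsets into swizzled scratch, so the
    // per-lane alignment is scaled by the wavefront size before being applied.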
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
        .addReg(ScratchSPReg, RegState::Kill)
        .addImm(-Alignment * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    // If we need a base pointer, set it up here. It's whatever the value of
    // the stack pointer is at this point. Any variable size objects will be
    // allocated after this, so we can still use the base pointer to reference
    // locals.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
  }

  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
                     FuncInfo->FramePointerSaveIndex)) &&
         "Needed to save FP but didn't save it anywhere");

  assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
                    !FuncInfo->FramePointerSaveIndex)) &&
         "Saved FP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  LivePhysRegs LiveRegs;
  DebugLoc DL;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  const Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  Register ScratchExecCopy;
  if (HasFPSaveIndex) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));
    if (SpillFPToMemory) {
      if (!ScratchExecCopy)
        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
                        FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
          .addReg(TempVGPR, RegState::Kill);
    } else {
      // Reload from VGPR spill.
      assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
          FuncInfo->getSGPRToVGPRSpills(FI);
      assert(Spill.size() == 1);
      BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
              FramePtrReg)
          .addReg(Spill[0].VGPR)
          .addImm(Spill[0].Lane);
    }
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
       FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

    buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
                      Reg.FI.getValue());
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
                                 Optional<int> FramePointerSaveIndex) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        FramePointerSaveIndex && I != FramePointerSaveIndex) {
      return false;
    }
  }

  return true;
}
#endif

int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                            Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return MF.getFrameInfo().getObjectOffset(FI);
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  FuncInfo->removeDeadFrameIndices(MFI);
  assert(allSGPRSpillsAreDead(MFI, None) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

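    // Reserve an emergency slot for the register scavenger so that frame
    // index elimination always has somewhere to spill a register. Entry
    // functions can use a fixed object at offset 0; other functions get an
    // ordinary stack object.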
    if (FuncInfo->isEntryFunction()) {
      int ScavengeFI = MFI.CreateFixedObject(
        TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
      RS->addScavengingFrameIndex(ScavengeFI);
    } else {
      int ScavengeFI = MFI.CreateStackObject(
        TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
        TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
        false);
      RS->addScavengingFrameIndex(ScavengeFI);
    }
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
  // so don't allow the default insertion to handle them.
  for (auto SSpill : MFI->getSGPRSpillVGPRs())
    SavedVGPRs.reset(SSpill.VGPR);

  const bool HasFP = WillHaveFP || hasFP(MF);
  if (!HasFP)
    return;

  // We need to save and restore the current FP.

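  // The save location is chosen in decreasing order of preference: a lane in
  // an already-reserved spill VGPR, an unused non-CSR SGPR, a lane in a newly
  // reserved VGPR, and finally a real scratch memory slot.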
  // 1: If there is already a VGPR with free lanes, use it. We
  // may already have to pay the penalty for spilling a CSR VGPR.
  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
                                                    TargetStackID::SGPRSpill);

    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");

    MFI->FramePointerSaveIndex = NewFI;

    LLVM_DEBUG(
      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
      dbgs() << "Spilling FP to  " << printReg(Spill.VGPR, TRI)
             << ':' << Spill.Lane << '\n');
    return;
  }

  // 2: Next, try to save the FP in an unused SGPR.
  MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());

  if (!MFI->SGPRForFPSaveRestoreCopy) {
    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
                                                    TargetStackID::SGPRSpill);

    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP, so
      // we're forced to spill another VGPR to use for the spill.
      MFI->FramePointerSaveIndex = NewFI;
    } else {
      // 4: If all else fails, spill the FP to memory.
      MFI->FramePointerSaveIndex =
          FrameInfo.CreateSpillStackObject(4, Align(4));
    }

    LLVM_DEBUG(
      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
      dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
             << ':' << Spill.Lane << '\n';);
  } else {
    LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
               printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());
  SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

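  // If the FP is being saved with a copy to a scratch SGPR, redirect its
  // callee-saved entry to that register; returning false lets the default
  // spill/restore insertion handle the remaining CSRs.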
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (!FuncInfo->SGPRForFPSaveRestoreCopy)
    return false;

  for (auto &CS : CSI) {
    if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
      if (FuncInfo->SGPRForFPSaveRestoreCopy)
        CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      break;
    }
  }

  return false;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
  MachineFunction &MF,
  MachineBasicBlock &MBB,
  MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

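    // The SP is adjusted by the per-lane amount scaled to a wave-relative
    // swizzled byte offset, mirroring the prologue/epilogue adjustments.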
    unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
    BuildMI(MBB, I, DL, TII->get(Op), SPReg)
      .addReg(SPReg)
      .addImm(Amount * ST.getWavefrontSize());
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry functions we can use an immediate offset in most cases, so the
  // presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
    MFI.hasStackMap() || MFI.hasPatchPoint() ||
    MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
    MF.getTarget().Options.DisableFramePointerElim(MF);
}