1 //===----------------------- SIFrameLowering.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 
9 #include "SIFrameLowering.h"
10 #include "AMDGPUSubtarget.h"
11 #include "SIInstrInfo.h"
12 #include "SIMachineFunctionInfo.h"
13 #include "SIRegisterInfo.h"
14 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15 
16 #include "llvm/CodeGen/LivePhysRegs.h"
17 #include "llvm/CodeGen/MachineFrameInfo.h"
18 #include "llvm/CodeGen/MachineFunction.h"
19 #include "llvm/CodeGen/MachineInstrBuilder.h"
20 #include "llvm/CodeGen/RegisterScavenging.h"
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "frame-info"
25 
26 
27 static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
28                                          const MachineFunction &MF) {
29   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
30                       ST.getMaxNumSGPRs(MF) / 4);
31 }
32 
33 static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
34                                        const MachineFunction &MF) {
35   return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
36                       ST.getMaxNumSGPRs(MF));
37 }
38 
39 // Find a scratch register that we can use at the start of the prologue to
40 // re-align the stack pointer. We avoid using callee-save registers since they
41 // may appear to be free when this is called from canUseAsPrologue (during
42 // shrink wrapping), but then no longer be free when this is called from
43 // emitPrologue.
44 //
45 // FIXME: This is a bit conservative, since in the above case we could use one
46 // of the callee-save registers as a scratch temp to re-align the stack pointer,
47 // but we would then have to make sure that we were in fact saving at least one
48 // callee-save register in the prologue, which is additional complexity that
49 // doesn't seem worth the benefit.
50 static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
51                                                  LivePhysRegs &LiveRegs,
52                                                  const TargetRegisterClass &RC,
53                                                  bool Unused = false) {
54   // Mark callee saved registers as used so we will not choose them.
55   const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
56   for (unsigned i = 0; CSRegs[i]; ++i)
57     LiveRegs.addReg(CSRegs[i]);
58 
59   if (Unused) {
60     // We are looking for a register that can be used throughout the entire
61     // function, so any use is unacceptable.
62     for (unsigned Reg : RC) {
63       if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
64         return Reg;
65     }
66   } else {
67     for (unsigned Reg : RC) {
68       if (LiveRegs.available(MRI, Reg))
69         return Reg;
70     }
71   }
72 
73   // If we require an unused register, this is used in contexts where failure is
74   // an option and has an alternative plan. In other contexts, this must
75   // succeed0.
76   if (!Unused)
77     report_fatal_error("failed to find free scratch register");
78 
79   return AMDGPU::NoRegister;
80 }
81 
82 static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
83   LivePhysRegs LiveRegs;
84   LiveRegs.init(*MRI.getTargetRegisterInfo());
85   return findScratchNonCalleeSaveRegister(
86     MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
87 }
88 
// We need to specially emit stack operations here because a different frame
// register is used than in the rest of the function, as getFrameRegister would
// use.
//
// Emits a prologue store of SpillReg into stack slot FI. If the slot's byte
// offset fits in the 12-bit immediate field, an OFFSET-form buffer store is
// used; otherwise a scratch VGPR is materialized to carry the offset and the
// OFFEN form is used instead.
static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I,
                             const SIInstrInfo *TII, unsigned SpillReg,
                             unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();

  int64_t Offset = MFI.getObjectOffset(FI);

  // 4-byte store with the slot's alignment; tagged as a fixed-stack access.
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
      MFI.getObjectAlignment(FI));

  if (isUInt<12>(Offset)) {
    // Offset fits the immediate field: no extra register needed.
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
      .addReg(SpillReg, RegState::Kill)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addMemOperand(MMO);
    return;
  }

  // Offset too large for the immediate: load it into a scratch VGPR. This
  // aborts via report_fatal_error if no free VGPR exists.
  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
    .addReg(SpillReg, RegState::Kill)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addMemOperand(MMO);
}
137 
// Epilogue counterpart of buildPrologSpill: reloads SpillReg from stack slot
// FI. Uses the OFFSET buffer-load form when the slot offset fits the 12-bit
// immediate, otherwise materializes the offset in a scratch VGPR and uses the
// OFFEN form.
static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
                              const SIInstrInfo *TII, unsigned SpillReg,
                              unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  int64_t Offset = MFI.getObjectOffset(FI);

  // 4-byte load with the slot's alignment; tagged as a fixed-stack access.
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
      MFI.getObjectAlignment(FI));

  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(),
            TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addMemOperand(MMO);
    return;
  }

  // Offset too large for the immediate: load it into a scratch VGPR. This
  // aborts via report_fatal_error if no free VGPR exists.
  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(),
          TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addMemOperand(MMO);
}
182 
// Initializes the flat-scratch base for an entry function from the preloaded
// FLAT_SCRATCH_INIT argument and the scratch wave offset. The exact sequence
// depends on whether flat scratch is a pointer on this subtarget and on the
// GPU generation (GFX10 writes the hwregs via S_SETREG_B32).
void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
                                          MachineFunction &MF,
                                          MachineBasicBlock &MBB) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo* TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  unsigned FlatScratchInitReg
    = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

  // Re-add the live-in; argument lowering may have dropped it as unused.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      // GFX10: FLAT_SCR is not an addressable register pair; the result of
      // the 64-bit add must be written through hwreg setreg instead.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitLo).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitHi).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // Pre-GFX10: add the wave offset directly into FLAT_SCR_LO/HI.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  // Non-pointer flat scratch only exists before GFX10.
  assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}
264 
// Returns the register to use as the scratch resource descriptor (SGPR128),
// possibly shifting the default high reservation down to just past the
// preloaded user SGPRs so fewer registers are wasted. Returns NoRegister when
// no descriptor is needed.
unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
  const GCNSubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // We need to insert initialization of the scratch resource descriptor.
  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  // Nothing to do if no register was reserved, or it ended up unused.
  if (ScratchRsrcReg == AMDGPU::NoRegister ||
      !MRI.isPhysRegUsed(ScratchRsrcReg))
    return AMDGPU::NoRegister;

  // Keep the current choice when the SGPR-init hardware bug applies, or when
  // the register has already been moved away from the default reservation.
  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  // We find the resource first because it has an alignment requirement.

  // Number of SGPR128 quads covered by the preloaded SGPRs (rounded up).
  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      // Rewrite every use of the old descriptor register to the new one.
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}
312 
// Shift down registers reserved for the scratch wave offset.
//
// Returns the register holding the scratch wave byte offset for an entry
// function, possibly relocating it (and, when they are the same register, the
// stack pointer) to the first free SGPR past the preloaded ones. Returns
// NoRegister when no offset register is needed.
unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
    const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
    SIMachineFunctionInfo *MFI, MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // Only entry functions use this scheme.
  assert(MFI->isEntryFunction());

  // No replacement necessary.
  if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
      (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) {
    return AMDGPU::NoRegister;
  }

  // Hardware bug: keep the default reservation.
  if (ST.hasSGPRInitBug())
    return ScratchWaveOffsetReg;

  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
  if (NumPreloaded > AllSGPRs.size())
    return ScratchWaveOffsetReg;

  AllSGPRs = AllSGPRs.slice(NumPreloaded);

  // We need to drop registers from the end of the list that we cannot use
  // for the scratch wave offset.
  // + 2 s102 and s103 do not exist on VI.
  // + 2 for vcc
  // + 2 for xnack_mask
  // + 2 for flat_scratch
  // + 4 for registers reserved for scratch resource register
  // + 1 for register reserved for scratch wave offset.  (By excluding this
  //     register from the list to consider, it means that when this
  //     register is being used for the scratch wave offset and there
  //     are no other free SGPRs, then the value will stay in this register.
  // + 1 if stack pointer is used.
  // ----
  //  13 (+1)
  unsigned ReservedRegCount = 13;

  if (AllSGPRs.size() < ReservedRegCount)
    return ScratchWaveOffsetReg;

  // Already handled if the register was moved away from the default earlier.
  bool HandledScratchWaveOffsetReg =
    ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);

  for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
    // scratch descriptor, since we haven't added its uses yet.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      if (!HandledScratchWaveOffsetReg) {
        HandledScratchWaveOffsetReg = true;

        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
        // If SP aliased the wave offset, move SP with it; only legal without
        // an FP (asserted).
        if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
          assert(!hasFP(MF));
          MFI->setStackPtrOffsetReg(Reg);
        }

        MFI->setScratchWaveOffsetReg(Reg);
        MFI->setFrameOffsetReg(Reg);
        ScratchWaveOffsetReg = Reg;
        break;
      }
    }
  }

  return ScratchWaveOffsetReg;
}
384 
// Prologue for kernels / entry functions: sets up flat scratch, the scratch
// resource descriptor, the scratch wave offset, and the initial stack pointer.
// Entry functions never spill CSRs or realign, so this is disjoint from
// emitPrologue's non-entry path.
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If we only have SGPR spills, we won't actually be using scratch memory
  // since these spill to VGPRs.
  //
  // FIXME: We should be cleaning up these unused SGPR spill frame indices
  // somewhere.

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  // We need to do the replacement of the private segment buffer and wave offset
  // register even if there are no stack objects. There could be stores to undef
  // or a constant without an associated object.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  if (MFI->hasFlatScratchInit())
    emitFlatScratchInit(ST, MF, MBB);

  // Possibly shift the reserved rsrc/offset registers down past the
  // preloaded SGPRs (see the helpers above).
  unsigned ScratchRsrcReg
    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);

  unsigned ScratchWaveOffsetReg =
      getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);

  // We need to insert initialization of the scratch resource descriptor.
  unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
    AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedPrivateBufferReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
  }

  bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister &&
                       MRI.isPhysRegUsed(ScratchWaveOffsetReg);
  bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
                         MRI.isPhysRegUsed(ScratchRsrcReg);

  // FIXME: Hack to not crash in situations which emitted an error.
  if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
    return;

  // We added live-ins during argument lowering, but since they were not used
  // they were deleted. We're adding the uses now, so add them back.
  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);

  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
    assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
    MRI.addLiveIn(PreloadedPrivateBufferReg);
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

  // Make the register selected live throughout the function.
  for (MachineBasicBlock &OtherBB : MF) {
    if (&OtherBB == &MBB)
      continue;

    if (OffsetRegUsed)
      OtherBB.addLiveIn(ScratchWaveOffsetReg);

    if (ResourceRegUsed)
      OtherBB.addLiveIn(ScratchRsrcReg);
  }

  // Unknown DebugLoc: the first real location marks the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // If we reserved the original input registers, we don't need to copy to the
  // reserved registers.

  bool CopyBuffer = ResourceRegUsed &&
    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
    ST.isAmdHsaOrMesa(F) &&
    ScratchRsrcReg != PreloadedPrivateBufferReg;

  // This needs to be careful of the copying order to avoid overwriting one of
  // the input registers before it's been copied to it's final
  // destination. Usually the offset should be copied first.
  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
                                              ScratchWaveOffsetReg);
  if (CopyBuffer && CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  unsigned SPReg = MFI->getStackPtrOffsetReg();
  assert(SPReg != AMDGPU::SP_REG);

  // FIXME: Remove the isPhysRegUsed checks
  const bool HasFP = hasFP(MF);

  if (HasFP || OffsetRegUsed) {
    assert(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
      .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
  }

  if (CopyBuffer && !CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (ResourceRegUsed) {
    emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
        PreloadedPrivateBufferReg, ScratchRsrcReg);
  }

  if (HasFP) {
    DebugLoc DL;
    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
    int64_t StackSize = FrameInfo.getStackSize();

    // On kernel entry, the private scratch wave offset is the SP value.
    if (StackSize == 0) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
        .addReg(MFI->getScratchWaveOffsetReg());
    } else {
      // Stack sizes are tracked in per-lane dwords; scale by wavefront size
      // to get the per-wave byte offset.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
        .addReg(MFI->getScratchWaveOffsetReg())
        .addImm(StackSize * ST.getWavefrontSize());
    }
  }
}
522 
// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
//
// On AMDPAL, the scratch descriptor is loaded from the Global Information
// Table (GIT); on Mesa GFX shaders (or when no buffer was preloaded), it is
// assembled from relocations / the implicit buffer pointer plus hard-coded
// words 2 and 3.
void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
      MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
      MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
      unsigned ScratchRsrcReg) const {

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const Function &Fn = MF.getFunction();
  DebugLoc DL;

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    if (MFI->getGITPtrHigh() != 0xffffffff) {
      // High half of the GIT address supplied via function attribute.
      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    } else {
      // Otherwise derive it from the current PC.
      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
    }
    auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
    if (ST.hasMergedShaders()) {
      switch (MF.getFunction().getCallingConv()) {
        case CallingConv::AMDGPU_HS:
        case CallingConv::AMDGPU_GS:
          // Low GIT address is passed in s8 rather than s0 for an LS+HS or
          // ES+GS merged shader on gfx9+.
          GitPtrLo = AMDGPU::SGPR8;
          break;
        default:
          break;
      }
    }
    MF.getRegInfo().addLiveIn(GitPtrLo);
    MBB.addLiveIn(GitPtrLo);
    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
      .addReg(GitPtrLo)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    PointerType *PtrTy =
      PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
                       AMDGPUAS::CONSTANT_ADDRESS);
    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                       MachineMemOperand::MOInvariant |
                                       MachineMemOperand::MODereferenceable,
                                       16, 4);
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // glc
      .addImm(0) // dlc
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);
    return;
  }
  if (ST.isMesaGfxShader(Fn)
      || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        // Compute: the implicit buffer pointer IS the descriptor base.
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        // Graphics: load the 64-bit base through the implicit buffer pointer.
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        PointerType *PtrTy =
          PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
                           AMDGPUAS::CONSTANT_ADDRESS);
        MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
        auto MMO = MF.getMachineMemOperand(PtrInfo,
                                           MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                           8, 4);
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // glc
          .addImm(0) // dlc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      // No implicit pointer: the loader patches these external symbols in.
      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    }

    // Words 2-3 (NUM_RECORDS / flags) are target-constant for scratch.
    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  }
}
660 
// Returns true for the stack IDs this target's frame lowering can handle.
// Deliberately a covering switch with no default so that adding a new
// TargetStackID value produces a compiler warning here.
bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}
670 
// Prologue for non-entry (callable) functions: saves the FP (to an SGPR copy
// and/or a VGPR lane), spills the CSR VGPRs used for SGPR spilling, performs
// stack realignment if required, sets up the FP, and bumps the SP by the
// frame size. Entry functions are dispatched to emitEntryFunctionPrologue.
void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
  // Lazily initialized (see LiveRegs.empty() checks below) so the work is
  // skipped when no scratch registers are needed.
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // Unknown DebugLoc: the first real location marks the end of the prologue.
  DebugLoc DL;

  bool HasFP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;
  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
  // turn on all lanes before doing the spill to memory.
  unsigned ScratchExecCopy = AMDGPU::NoRegister;

  // Emit the copy if we need an FP, and are using a free SGPR to save it.
  if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
      .addReg(FramePtrReg)
      .setMIFlag(MachineInstr::FrameSetup);
  }

  // Spill each CSR VGPR used for SGPR-to-VGPR spills that has a stack slot.
  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (ScratchExecCopy == AMDGPU::NoRegister) {
      if (LiveRegs.empty()) {
        LiveRegs.init(TRI);
        LiveRegs.addLiveIns(MBB);
        // The FP save copy above runs before this point, so its destination
        // already holds the old FP; making it available again here lets it
        // be... NOTE(review): this *removes* the reg, while the realignment
        // path below *adds* it — confirm the asymmetry is intentional.
        if (FuncInfo->SGPRForFPSaveRestoreCopy)
          LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      }

      ScratchExecCopy
        = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
                                           *TRI.getWaveMaskRegClass());
      assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);

      // Save exec and enable all lanes so inactive lanes' VGPR values are
      // also written to memory.
      const unsigned OrSaveExec = ST.isWave32() ?
        AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
      BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
              ScratchExecCopy)
        .addImm(-1);
    }

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                     FuncInfo->getScratchRSrcReg(),
                     StackPtrReg,
                     Reg.FI.getValue());
  }

  if (ScratchExecCopy != AMDGPU::NoRegister) {
    // FIXME: Split block and make terminator.
    // Restore the original exec mask after the spills.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
      .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }


  // If the FP is being saved to a VGPR lane rather than an SGPR copy, write
  // it now, before the FP is redefined below.
  if (FuncInfo->FramePointerSaveIndex) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI) &&
           MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
      = FuncInfo->getSGPRToVGPRSpills(FI);
    assert(Spill.size() == 1);

    // Save FP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR;
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            Spill[0].VGPR)
      .addReg(FramePtrReg)
      .addImm(Spill[0].Lane)
      .addReg(Spill[0].VGPR, RegState::Undef);
  }

  if (TRI.needsStackRealignment(MF)) {
    HasFP = true;
    const unsigned Alignment = MFI.getMaxAlignment();

    // Reserve extra space so the aligned frame still fits.
    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      // NOTE(review): this adds SGPRForFPSaveRestoreCopy (marking it
      // unavailable as a scratch reg) even when it may be NoRegister —
      // confirm addReg tolerates NoRegister and that add-vs-remove here is
      // intended relative to the spill path above.
      LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
    }

    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
    assert(ScratchSPReg != AMDGPU::NoRegister &&
           ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);

    // s_add_u32 tmp_reg, s32, NumBytes
    // s_and_b32 s32, tmp_reg, 0b111...0000
    // Offsets are in per-lane units, hence the wavefront-size scaling.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
      .addReg(StackPtrReg)
      .addImm((Alignment - 1) * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
      .addReg(ScratchSPReg, RegState::Kill)
      .addImm(-Alignment * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    // If we need a base pointer, set it up here. It's whatever the value of
    // the stack pointer is at this point. Any variable size objects will be
    // allocated after this, so we can still use the base pointer to reference
    // locals.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
      .addReg(StackPtrReg)
      .setMIFlag(MachineInstr::FrameSetup);
  }

  // Bump SP past the (possibly alignment-padded) frame.
  if (HasFP && RoundedSize != 0) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
  }

  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
                     FuncInfo->FramePointerSaveIndex)) &&
         "Needed to save FP but didn't save it anywhere");

  assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
                    !FuncInfo->FramePointerSaveIndex)) &&
         "Saved FP but didn't need it");
}
816 
817 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
818                                    MachineBasicBlock &MBB) const {
819   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
820   if (FuncInfo->isEntryFunction())
821     return;
822 
823   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
824   const SIInstrInfo *TII = ST.getInstrInfo();
825   MachineRegisterInfo &MRI = MF.getRegInfo();
826   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
827   LivePhysRegs LiveRegs;
828   DebugLoc DL;
829 
830   const MachineFrameInfo &MFI = MF.getFrameInfo();
831   uint32_t NumBytes = MFI.getStackSize();
832   uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
833     NumBytes + MFI.getMaxAlignment() : NumBytes;
834 
835   if (RoundedSize != 0 && hasFP(MF)) {
836     const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
837     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
838       .addReg(StackPtrReg)
839       .addImm(RoundedSize * ST.getWavefrontSize())
840       .setMIFlag(MachineInstr::FrameDestroy);
841   }
842 
843   if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
844     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg())
845       .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
846       .setMIFlag(MachineInstr::FrameSetup);
847   }
848 
849   if (FuncInfo->FramePointerSaveIndex) {
850     const int FI = FuncInfo->FramePointerSaveIndex.getValue();
851 
852     assert(!MF.getFrameInfo().isDeadObjectIndex(FI) &&
853            MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill);
854 
855     ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
856       = FuncInfo->getSGPRToVGPRSpills(FI);
857     assert(Spill.size() == 1);
858     BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
859             FuncInfo->getFrameOffsetReg())
860       .addReg(Spill[0].VGPR)
861       .addImm(Spill[0].Lane);
862   }
863 
864   unsigned ScratchExecCopy = AMDGPU::NoRegister;
865   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
866          : FuncInfo->getSGPRSpillVGPRs()) {
867     if (!Reg.FI.hasValue())
868       continue;
869 
870     const SIRegisterInfo &TRI = TII->getRegisterInfo();
871     if (ScratchExecCopy == AMDGPU::NoRegister) {
872       // See emitPrologue
873       if (LiveRegs.empty()) {
874         LiveRegs.init(*ST.getRegisterInfo());
875         LiveRegs.addLiveOuts(MBB);
876         LiveRegs.stepBackward(*MBBI);
877       }
878 
879       ScratchExecCopy = findScratchNonCalleeSaveRegister(
880           MRI, LiveRegs, *TRI.getWaveMaskRegClass());
881       LiveRegs.removeReg(ScratchExecCopy);
882 
883       const unsigned OrSaveExec =
884           ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
885 
886       BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
887         .addImm(-1);
888     }
889 
890     buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
891                       FuncInfo->getScratchRSrcReg(),
892                       FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
893   }
894 
895   if (ScratchExecCopy != AMDGPU::NoRegister) {
896     // FIXME: Split block and make terminator.
897     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
898     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
899     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
900       .addReg(ScratchExecCopy, RegState::Kill);
901   }
902 }
903 
904 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
905 // memory. They should have been removed by now.
906 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
907   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
908        I != E; ++I) {
909     if (!MFI.isDeadObjectIndex(I))
910       return false;
911   }
912 
913   return true;
914 }
915 
916 #ifndef NDEBUG
917 static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
918                                  Optional<int> FramePointerSaveIndex) {
919   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
920        I != E; ++I) {
921     if (!MFI.isDeadObjectIndex(I) &&
922         MFI.getStackID(I) == TargetStackID::SGPRSpill &&
923         FramePointerSaveIndex && I != FramePointerSaveIndex) {
924       return false;
925     }
926   }
927 
928   return true;
929 }
930 #endif
931 
932 int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
933                                             unsigned &FrameReg) const {
934   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
935 
936   FrameReg = RI->getFrameRegister(MF);
937   return MF.getFrameInfo().getObjectOffset(FI);
938 }
939 
940 void SIFrameLowering::processFunctionBeforeFrameFinalized(
941   MachineFunction &MF,
942   RegScavenger *RS) const {
943   MachineFrameInfo &MFI = MF.getFrameInfo();
944 
945   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
946   const SIRegisterInfo *TRI = ST.getRegisterInfo();
947   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
948 
949   FuncInfo->removeDeadFrameIndices(MFI);
950   assert(allSGPRSpillsAreDead(MFI, None) &&
951          "SGPR spill should have been removed in SILowerSGPRSpills");
952 
953   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
954   // but currently hasNonSpillStackObjects is set only from source
955   // allocas. Stack temps produced from legalization are not counted currently.
956   if (!allStackObjectsAreDead(MFI)) {
957     assert(RS && "RegScavenger required if spilling");
958 
959     if (FuncInfo->isEntryFunction()) {
960       int ScavengeFI = MFI.CreateFixedObject(
961         TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
962       RS->addScavengingFrameIndex(ScavengeFI);
963     } else {
964       int ScavengeFI = MFI.CreateStackObject(
965         TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
966         TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
967         false);
968       RS->addScavengingFrameIndex(ScavengeFI);
969     }
970   }
971 }
972 
973 // Only report VGPRs to generic code.
974 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
975                                            BitVector &SavedVGPRs,
976                                            RegScavenger *RS) const {
977   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
978   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
979   if (MFI->isEntryFunction())
980     return;
981 
982   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
983   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
984   const SIRegisterInfo *TRI = ST.getRegisterInfo();
985 
986   // Ignore the SGPRs the default implementation found.
987   SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
988 
989   // hasFP only knows about stack objects that already exist. We're now
990   // determining the stack slots that will be created, so we have to predict
991   // them. Stack objects force FP usage with calls.
992   //
993   // Note a new VGPR CSR may be introduced if one is used for the spill, but we
994   // don't want to report it here.
995   //
996   // FIXME: Is this really hasReservedCallFrame?
997   const bool WillHaveFP =
998       FrameInfo.hasCalls() &&
999       (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1000 
1001   // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
1002   // so don't allow the default insertion to handle them.
1003   for (auto SSpill : MFI->getSGPRSpillVGPRs())
1004     SavedVGPRs.reset(SSpill.VGPR);
1005 
1006   const bool HasFP = WillHaveFP || hasFP(MF);
1007   if (!HasFP)
1008     return;
1009 
1010   if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
1011     int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
1012                                                     TargetStackID::SGPRSpill);
1013 
1014     // If there is already a VGPR with free lanes, use it. We may already have
1015     // to pay the penalty for spilling a CSR VGPR.
1016     if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
1017       llvm_unreachable("allocate SGPR spill should have worked");
1018 
1019     MFI->FramePointerSaveIndex = NewFI;
1020 
1021     LLVM_DEBUG(
1022       auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
1023       dbgs() << "Spilling FP to  " << printReg(Spill.VGPR, TRI)
1024              << ':' << Spill.Lane << '\n');
1025     return;
1026   }
1027 
1028   MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
1029 
1030   if (!MFI->SGPRForFPSaveRestoreCopy) {
1031     // There's no free lane to spill, and no free register to save FP, so we're
1032     // forced to spill another VGPR to use for the spill.
1033     int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
1034                                                     TargetStackID::SGPRSpill);
1035     if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
1036       llvm_unreachable("allocate SGPR spill should have worked");
1037     MFI->FramePointerSaveIndex = NewFI;
1038 
1039     LLVM_DEBUG(
1040       auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
1041       dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
1042              << ':' << Spill.Lane << '\n';);
1043   } else {
1044     LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
1045                printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
1046   }
1047 }
1048 
1049 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1050                                                BitVector &SavedRegs,
1051                                                RegScavenger *RS) const {
1052   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1053   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1054   if (MFI->isEntryFunction())
1055     return;
1056 
1057   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1058   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1059 
1060   // The SP is specifically managed and we don't want extra spills of it.
1061   SavedRegs.reset(MFI->getStackPtrOffsetReg());
1062   SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
1063 }
1064 
1065 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1066     MachineFunction &MF, const TargetRegisterInfo *TRI,
1067     std::vector<CalleeSavedInfo> &CSI) const {
1068   if (CSI.empty())
1069     return true; // Early exit if no callee saved registers are modified!
1070 
1071   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1072   if (!FuncInfo->SGPRForFPSaveRestoreCopy)
1073     return false;
1074 
1075   for (auto &CS : CSI) {
1076     if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
1077       if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
1078         CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
1079       break;
1080     }
1081   }
1082 
1083   return false;
1084 }
1085 
1086 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1087   MachineFunction &MF,
1088   MachineBasicBlock &MBB,
1089   MachineBasicBlock::iterator I) const {
1090   int64_t Amount = I->getOperand(0).getImm();
1091   if (Amount == 0)
1092     return MBB.erase(I);
1093 
1094   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1095   const SIInstrInfo *TII = ST.getInstrInfo();
1096   const DebugLoc &DL = I->getDebugLoc();
1097   unsigned Opc = I->getOpcode();
1098   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1099   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1100 
1101   if (!hasReservedCallFrame(MF)) {
1102     unsigned Align = getStackAlignment();
1103 
1104     Amount = alignTo(Amount, Align);
1105     assert(isUInt<32>(Amount) && "exceeded stack address space size");
1106     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1107     unsigned SPReg = MFI->getStackPtrOffsetReg();
1108 
1109     unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
1110     BuildMI(MBB, I, DL, TII->get(Op), SPReg)
1111       .addReg(SPReg)
1112       .addImm(Amount * ST.getWavefrontSize());
1113   } else if (CalleePopAmount != 0) {
1114     llvm_unreachable("is this used?");
1115   }
1116 
1117   return MBB.erase(I);
1118 }
1119 
1120 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1121   const MachineFrameInfo &MFI = MF.getFrameInfo();
1122   if (MFI.hasCalls()) {
1123     // All offsets are unsigned, so need to be addressed in the same direction
1124     // as stack growth.
1125 
1126     // FIXME: This function is pretty broken, since it can be called before the
1127     // frame layout is determined or CSR spills are inserted.
1128     if (MFI.getStackSize() != 0)
1129       return true;
1130 
1131     // For the entry point, the input wave scratch offset must be copied to the
1132     // API SP if there are calls.
1133     if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
1134       return true;
1135   }
1136 
1137   return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
1138     MFI.hasStackMap() || MFI.hasPatchPoint() ||
1139     MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
1140     MF.getTarget().Options.DisableFramePointerElim(MF);
1141 }
1142