//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"

#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  if (Unused) {
    // We are looking for a register that can be used throughout the entire
    // function, so any use is unacceptable.
    for (MCRegister Reg : RC) {
      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
        return Reg;
    }
  } else {
    for (MCRegister Reg : RC) {
      if (LiveRegs.available(MRI, Reg))
        return Reg;
    }
  }
  // If we require an unused register, we are being called from a context where
  // failure is an option and the caller has an alternative plan. In other
  // contexts, this must succeed.
  if (!Unused)
    report_fatal_error("failed to find free scratch register");

  return MCRegister();
}

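// Decide where to preserve the FP or BP across the call frame. The logic below
// tries, in order: (1) a free lane in an already-spilled CSR VGPR, (2) a copy
// into an unused SGPR, (3) a lane in a newly spilled VGPR, and finally (4) a
// plain spill to scratch memory.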
static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
                                           LivePhysRegs &LiveRegs,
                                           Register &TempSGPR,
                                           Optional<int> &FrameIndex,
                                           bool IsFP) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

#ifndef NDEBUG
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
#endif

  // We need to save and restore the current FP/BP.

  // 1: If there is already a VGPR with free lanes, use it. We
  // may already have to pay the penalty for spilling a CSR VGPR.
  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");

    FrameIndex = NewFI;

    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
               dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                      << '\n');
    return;
  }

  // 2: Next, try to save the FP/BP in an unused SGPR.
  TempSGPR = findScratchNonCalleeSaveRegister(
      MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);

  if (!TempSGPR) {
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP/BP,
      // so we're forced to spill another VGPR to use for the spill.
      FrameIndex = NewFI;

      LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
                 dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n');
    } else {
      // 4: If all else fails, spill the FP/BP to memory. Note the SGPR spill
      // lanes for NewFI were never allocated in this case, so only report the
      // frame index.
      FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
      LLVM_DEBUG(dbgs() << (IsFP ? "FP" : "BP")
                        << " requires fallback spill to memory at FI "
                        << *FrameIndex << '\n');
    }
  } else {
    LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
                      << printReg(TempSGPR, TRI) << '\n');
  }
}

// We need to emit the stack operations specially here because the prologue
// uses a different frame register than the rest of the function (which is what
// getFrameRegister would return).
static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I,
                             const SIInstrInfo *TII, Register SpillReg,
                             Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();

  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
      MFI.getObjectAlign(FI));

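  // The MUBUF _OFFSET forms take a 12-bit unsigned immediate offset, so a
  // small frame offset can be folded directly into the store; larger offsets
  // need a scavenged VGPR to materialize the offset (handled below).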
  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
      .addReg(SpillReg, RegState::Kill)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
    return;
  }

  // Don't clobber SpillReg while scavenging a scratch reg for the stack
  // offset in the spill.
  LiveRegs.addReg(SpillReg);

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
    .addReg(SpillReg, RegState::Kill)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addImm(0) // swz
    .addMemOperand(MMO);

  LiveRegs.removeReg(SpillReg);
}

static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
                              const SIInstrInfo *TII, Register SpillReg,
                              Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
      MFI.getObjectAlign(FI));

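  // As in buildPrologSpill, a 12-bit immediate offset can be folded into the
  // load; anything larger goes through a scavenged offset VGPR.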
  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(),
            TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(),
          TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addImm(0) // swz
    .addMemOperand(MMO);
}

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`.
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScratchInitReg =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
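      // On GFX10+ the flat scratch base is programmed through s_setreg rather
      // than by writing a register pair. The immediate below encodes the
      // hardware register id plus a (width - 1) field of 31, i.e. roughly
      // hwreg(HW_REG_FLAT_SCR_LO, 0, 32), selecting all 32 bits.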
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
        .addReg(FlatScrInitLo)
        .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
        .addReg(FlatScrInitHi)
        .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

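  // Here FLAT_SCR_HI holds the scratch offset in 256-byte units, so the byte
  // offset computed above is shifted right by 8; e.g. a combined base and wave
  // offset of 0x1200 bytes yields 0x12 in FLAT_SCR_HI.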
  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

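  // Each SGPR128 tuple covers four consecutive SGPRs, so round the preloaded
  // SGPR count up to a whole number of tuples before slicing them off the
  // candidate list.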
  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks.

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
  // FIXME: Hack to not crash in situations which emitted an error.
  if (!PreloadedScratchWaveOffsetReg)
    return;

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg);

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
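    // The SP is a per-wave byte offset into swizzled scratch, so the per-lane
    // frame size is scaled by the wavefront size; e.g. a 16-byte frame on a
    // wave64 target initializes SP to 1024.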
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (MFI->hasFlatScratchInit()) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`.
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
    Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    if (MFI->getGITPtrHigh() != 0xffffffff) {
      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    } else {
      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
    }
    Register GitPtrLo = MFI->getGITPtrLoReg(MF);
    MF.getRegInfo().addLiveIn(GitPtrLo);
    MBB.addLiveIn(GitPtrLo);
    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
      .addReg(GitPtrLo)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // glc
      .addImm(0) // dlc
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();
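    // Words 2-3 of the V# hold NUM_RECORDS plus the format/swizzle flags;
    // getScratchRsrcWords23() packs a default scratch descriptor tail (an
    // effectively unlimited NUM_RECORDS in word 2, flags in word 3).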

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // glc
          .addImm(0) // dlc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::SVEVector:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate all lanes and return the saved exec mask.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     bool IsProlog) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  DebugLoc DL;

  if (LiveRegs.empty()) {
    if (IsProlog) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      if (FuncInfo->SGPRForFPSaveRestoreCopy)
        LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);

      if (FuncInfo->SGPRForBPSaveRestoreCopy)
        LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
    } else {
      // In epilog.
      LiveRegs.init(*ST.getRegisterInfo());
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());

  if (!IsProlog)
    LiveRegs.removeReg(ScratchExecCopy);

  const unsigned OrSaveExec =
      ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
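  // s_or_saveexec writes the current exec mask into the destination and then
  // ORs exec with the source; with an all-ones immediate this saves exec and
  // enables every lane in a single instruction.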
  BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);

  return ScratchExecCopy;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;
  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
  // turn on all lanes before doing the spill to memory.
  Register ScratchExecCopy;

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
  // Otherwise we are spilling the FP to memory.
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
  bool SpillBPToMemory = false;
  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
  // Otherwise we are spilling the BP to memory.
  if (HasBPSaveIndex) {
    SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  // Emit the copy if we need an FP, and are using a free SGPR to save it.
  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
            FuncInfo->SGPRForFPSaveRestoreCopy)
        .addReg(FramePtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Emit the copy if we need a BP, and are using a free SGPR to save it.
  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
            FuncInfo->SGPRForBPSaveRestoreCopy)
        .addReg(BasePtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If a copy has been emitted for the FP and/or BP, make the SGPRs used in
  // the copy instructions live throughout the function.
  SmallVector<MCPhysReg, 2> TempSGPRs;
  if (FuncInfo->SGPRForFPSaveRestoreCopy)
    TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);

  if (FuncInfo->SGPRForBPSaveRestoreCopy)
    TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);

  if (!TempSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : TempSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                     FuncInfo->getScratchRSrcReg(),
                     StackPtrReg,
                     Reg.FI.getValue());
  }

  if (HasFPSaveIndex && SpillFPToMemory) {
    assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(FramePtrReg);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
                     FuncInfo->FramePointerSaveIndex.getValue());
  }

  if (HasBPSaveIndex && SpillBPToMemory) {
    assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(BasePtrReg);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
                     *FuncInfo->BasePointerSaveIndex);
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }

  // If the FP save is assigned a VGPR lane rather than memory, write it with
  // v_writelane.
  if (HasFPSaveIndex && !SpillFPToMemory) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
        FuncInfo->getSGPRToVGPRSpills(FI);
    assert(Spill.size() == 1);

    // Save FP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            Spill[0].VGPR)
        .addReg(FramePtrReg)
        .addImm(Spill[0].Lane)
        .addReg(Spill[0].VGPR, RegState::Undef);
  }

  // Likewise, if the BP save is assigned a VGPR lane, write it with
  // v_writelane.
  if (HasBPSaveIndex && !SpillBPToMemory) {
    const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
    assert(!MFI.isDeadObjectIndex(BasePtrFI));

    assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
        FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
    assert(Spill.size() == 1);

    // Save BP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            Spill[0].VGPR)
        .addReg(BasePtrReg)
        .addImm(Spill[0].Lane)
        .addReg(Spill[0].VGPR, RegState::Undef);
  }

  if (TRI.needsStackRealignment(MF)) {
    HasFP = true;
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      if (FuncInfo->SGPRForFPSaveRestoreCopy)
        LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      if (FuncInfo->SGPRForBPSaveRestoreCopy)
        LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
    }

    Register ScratchSPReg = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
           ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);

    // s_add_u32 tmp_reg, s32, Alignment - 1
    // s_and_b32 fp, tmp_reg, ~(Alignment - 1)
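    // The alignment is in per-lane bytes, so both the round-up amount and the
    // mask are scaled by the wavefront size; e.g. a 16-byte alignment on a
    // wave64 target adds 15 * 64 and masks with -(16 * 64).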
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
        .addReg(ScratchSPReg, RegState::Kill)
        .addImm(-Alignment * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
  }

  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
                     FuncInfo->FramePointerSaveIndex)) &&
         "Needed to save FP but didn't save it anywhere");

  assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
                    !FuncInfo->FramePointerSaveIndex)) &&
         "Saved FP but didn't need it");

  assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
                     FuncInfo->BasePointerSaveIndex)) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
                    !FuncInfo->BasePointerSaveIndex)) &&
         "Saved BP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  LivePhysRegs LiveRegs;
  DebugLoc DL;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  const Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
  bool SpillBPToMemory = false;
  if (HasBPSaveIndex) {
    SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  Register ScratchExecCopy;
  if (HasFPSaveIndex) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));
    if (SpillFPToMemory) {
      if (!ScratchExecCopy)
        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
                        FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
          .addReg(TempVGPR, RegState::Kill);
    } else {
      // Reload from VGPR spill.
      assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
          FuncInfo->getSGPRToVGPRSpills(FI);
      assert(Spill.size() == 1);
      BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
              FramePtrReg)
          .addReg(Spill[0].VGPR)
          .addImm(Spill[0].Lane);
    }
  }

  if (HasBPSaveIndex) {
    const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
    assert(!MFI.isDeadObjectIndex(BasePtrFI));
    if (SpillBPToMemory) {
      if (!ScratchExecCopy)
        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
                        FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
          .addReg(TempVGPR, RegState::Kill);
    } else {
      // Reload from VGPR spill.
      assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
          FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
      assert(Spill.size() == 1);
      BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
              BasePtrReg)
          .addReg(Spill[0].VGPR)
          .addImm(Spill[0].Lane);
    }
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
       FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

    buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
                      Reg.FI.getValue());
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
                                 Optional<int> FramePointerSaveIndex,
                                 Optional<int> BasePointerSaveIndex) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        (I != FramePointerSaveIndex && I != BasePointerSaveIndex)) {
      return false;
    }
  }

  return true;
}
#endif

int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                            Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return MF.getFrameInfo().getObjectOffset(FI);
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF, RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  FuncInfo->removeDeadFrameIndices(MFI);
  assert(allSGPRSpillsAreDead(MFI, None, None) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    if (FuncInfo->isEntryFunction()) {
      int ScavengeFI = MFI.CreateFixedObject(
        TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
      RS->addScavengingFrameIndex(ScavengeFI);
    } else {
      int ScavengeFI = MFI.CreateStackObject(
          TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
          TRI->getSpillAlign(AMDGPU::SGPR_32RegClass), false);
      RS->addScavengingFrameIndex(ScavengeFI);
    }
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
  // so don't allow the default insertion to handle them.
  for (auto SSpill : MFI->getSGPRSpillVGPRs())
    SavedVGPRs.reset(SSpill.VGPR);

  LivePhysRegs LiveRegs;
  LiveRegs.init(*TRI);

  if (WillHaveFP || hasFP(MF)) {
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
                                   MFI->FramePointerSaveIndex, true);
  }

  if (TRI->hasBasePointer(MF)) {
    if (MFI->SGPRForFPSaveRestoreCopy)
      LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
                                   MFI->BasePointerSaveIndex, false);
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());
  SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
      !FuncInfo->SGPRForBPSaveRestoreCopy)
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  unsigned NumModifiedRegs = 0;

  if (FuncInfo->SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (FuncInfo->SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      if (--NumModifiedRegs == 0)
        break;
    } else if (CS.getReg() == BasePtrReg &&
               FuncInfo->SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs == 0)
        break;
    }
  }

  return false;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

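    // As elsewhere, SP tracks a wave-level byte offset into swizzled scratch,
    // so the per-lane adjustment is scaled by the wavefront size.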
    unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
    BuildMI(MBB, I, DL, TII->get(Op), SPReg)
      .addReg(SPReg)
      .addImm(Amount * ST.getWavefrontSize());
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

/// Returns true if the frame will require a reference to the stack pointer.
///
/// This is the set of conditions common to setting up the stack pointer in a
/// kernel, and for using a frame pointer in a callable function.
///
/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
/// references SP.
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
  return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
}

// The FP for kernels is always known 0, so we never really need to set up an
// explicit register for it. However, DisableFramePointerElim will force us to
// use a register for it.
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry functions we can use an immediate offset in most cases, so the
  // presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
         MF.getSubtarget<GCNSubtarget>()
             .getRegisterInfo()
             ->needsStackRealignment(MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}

// This is essentially a reduced version of hasFP for entry functions. Since the
// stack pointer is known 0 on entry to kernels, we never really need an FP
// register. We may need to initialize the stack pointer depending on the frame
// properties, which logically overlaps many of the cases where an ordinary
// function would require an FP.
bool SIFrameLowering::requiresStackPointerReference(
    const MachineFunction &MF) const {
  // Callable functions always require a stack pointer reference.
  assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
         "only expected to call this for entry points");

  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Entry points ordinarily don't need to initialize SP. We have to set it up
  // for callees if there are any. Also note tail calls are impossible/don't
  // make any sense for kernels.
  if (MFI.hasCalls())
    return true;

  // We still need to initialize the SP if we're doing anything weird that
  // references the SP, like variable sized stack objects.
  return frameTriviallyRequiresSP(MFI);
}