1 //===----------------------- SIFrameLowering.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
//===----------------------------------------------------------------------===//
8 
9 #include "SIFrameLowering.h"
10 #include "AMDGPUSubtarget.h"
11 #include "SIInstrInfo.h"
12 #include "SIMachineFunctionInfo.h"
13 #include "SIRegisterInfo.h"
14 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15 
16 #include "llvm/CodeGen/LivePhysRegs.h"
17 #include "llvm/CodeGen/MachineFrameInfo.h"
18 #include "llvm/CodeGen/MachineFunction.h"
19 #include "llvm/CodeGen/MachineInstrBuilder.h"
20 #include "llvm/CodeGen/RegisterScavenging.h"
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "frame-info"
25 
26 
27 // Find a scratch register that we can use at the start of the prologue to
28 // re-align the stack pointer. We avoid using callee-save registers since they
29 // may appear to be free when this is called from canUseAsPrologue (during
30 // shrink wrapping), but then no longer be free when this is called from
31 // emitPrologue.
32 //
33 // FIXME: This is a bit conservative, since in the above case we could use one
34 // of the callee-save registers as a scratch temp to re-align the stack pointer,
35 // but we would then have to make sure that we were in fact saving at least one
36 // callee-save register in the prologue, which is additional complexity that
37 // doesn't seem worth the benefit.
38 static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
39                                                    LivePhysRegs &LiveRegs,
40                                                    const TargetRegisterClass &RC,
41                                                    bool Unused = false) {
42   // Mark callee saved registers as used so we will not choose them.
43   const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
44   for (unsigned i = 0; CSRegs[i]; ++i)
45     LiveRegs.addReg(CSRegs[i]);
46 
47   if (Unused) {
48     // We are looking for a register that can be used throughout the entire
49     // function, so any use is unacceptable.
50     for (MCRegister Reg : RC) {
51       if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
52         return Reg;
53     }
54   } else {
55     for (MCRegister Reg : RC) {
56       if (LiveRegs.available(MRI, Reg))
57         return Reg;
58     }
59   }
60 
  // If an unused register is required, this is called from contexts where
  // failure is an option and the caller has an alternative plan. In other
  // contexts, this must succeed.
64   if (!Unused)
65     report_fatal_error("failed to find free scratch register");
66 
67   return MCRegister();
68 }
69 
70 static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
71                                            LivePhysRegs &LiveRegs,
72                                            Register &TempSGPR,
73                                            Optional<int> &FrameIndex,
74                                            bool IsFP) {
75   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
76   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
77 
78 #ifndef NDEBUG
79   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
80   const SIRegisterInfo *TRI = ST.getRegisterInfo();
81 #endif
82 
83   // We need to save and restore the current FP/BP.
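  // Try, in order: a free lane in an already-spilled CSR VGPR, a free SGPR, a
  // lane in a newly spilled VGPR, and finally an ordinary memory spill slot.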
84 
85   // 1: If there is already a VGPR with free lanes, use it. We
86   // may already have to pay the penalty for spilling a CSR VGPR.
87   if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
88     int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
89                                             TargetStackID::SGPRSpill);
90 
91     if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
92       llvm_unreachable("allocate SGPR spill should have worked");
93 
94     FrameIndex = NewFI;
95 
96     LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
97                dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to  "
98                       << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
99                       << '\n');
100     return;
101   }
102 
103   // 2: Next, try to save the FP/BP in an unused SGPR.
104   TempSGPR = findScratchNonCalleeSaveRegister(
105       MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
106 
107   if (!TempSGPR) {
108     int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
109                                             TargetStackID::SGPRSpill);
110 
111     if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill into, and no free register to save
      // the FP/BP, so we're forced to reserve another VGPR to use for the
      // spill.
114       FrameIndex = NewFI;
115 
116       LLVM_DEBUG(
117           auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
118           dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
119                  << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
120     } else {
121       // Remove dead <NewFI> index
122       MF.getFrameInfo().RemoveStackObject(NewFI);
123       // 4: If all else fails, spill the FP/BP to memory.
124       FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
125       LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling "
126                         << (IsFP ? "FP" : "BP") << '\n');
127     }
128   } else {
129     LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
130                       << printReg(TempSGPR, TRI) << '\n');
131   }
132 }
133 
// We need to specially emit stack operations here because a different frame
// register is used than the one getFrameRegister would return for the rest of
// the function.
137 static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
138                              MachineBasicBlock::iterator I,
139                              const SIInstrInfo *TII, Register SpillReg,
140                              Register ScratchRsrcReg, Register SPReg, int FI) {
141   MachineFunction *MF = MBB.getParent();
142   MachineFrameInfo &MFI = MF->getFrameInfo();
143 
144   int64_t Offset = MFI.getObjectOffset(FI);
145 
146   MachineMemOperand *MMO = MF->getMachineMemOperand(
147       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
148       MFI.getObjectAlign(FI));
149 
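  // The MUBUF immediate offset field is only 12 bits, so larger offsets have
  // to be materialized in a VGPR and applied through the OFFEN form below.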
150   if (isUInt<12>(Offset)) {
151     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
152       .addReg(SpillReg, RegState::Kill)
153       .addReg(ScratchRsrcReg)
154       .addReg(SPReg)
155       .addImm(Offset)
156       .addImm(0) // glc
157       .addImm(0) // slc
158       .addImm(0) // tfe
159       .addImm(0) // dlc
160       .addImm(0) // swz
161       .addMemOperand(MMO);
162     return;
163   }
164 
165   // Don't clobber the TmpVGPR if we also need a scratch reg for the stack
166   // offset in the spill.
167   LiveRegs.addReg(SpillReg);
168 
169   MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
170     MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
171 
172   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
173     .addImm(Offset);
174 
175   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
176     .addReg(SpillReg, RegState::Kill)
177     .addReg(OffsetReg, RegState::Kill)
178     .addReg(ScratchRsrcReg)
179     .addReg(SPReg)
180     .addImm(0)
181     .addImm(0) // glc
182     .addImm(0) // slc
183     .addImm(0) // tfe
184     .addImm(0) // dlc
185     .addImm(0) // swz
186     .addMemOperand(MMO);
187 
188   LiveRegs.removeReg(SpillReg);
189 }
190 
191 static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
192                               MachineBasicBlock::iterator I,
193                               const SIInstrInfo *TII, Register SpillReg,
194                               Register ScratchRsrcReg, Register SPReg, int FI) {
195   MachineFunction *MF = MBB.getParent();
196   MachineFrameInfo &MFI = MF->getFrameInfo();
197   int64_t Offset = MFI.getObjectOffset(FI);
198 
199   MachineMemOperand *MMO = MF->getMachineMemOperand(
200       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
201       MFI.getObjectAlign(FI));
202 
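  // Mirror buildPrologSpill: offsets that fit the 12-bit MUBUF immediate use
  // the OFFSET form, anything larger goes through a scratch VGPR and OFFEN.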
203   if (isUInt<12>(Offset)) {
204     BuildMI(MBB, I, DebugLoc(),
205             TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
206       .addReg(ScratchRsrcReg)
207       .addReg(SPReg)
208       .addImm(Offset)
209       .addImm(0) // glc
210       .addImm(0) // slc
211       .addImm(0) // tfe
212       .addImm(0) // dlc
213       .addImm(0) // swz
214       .addMemOperand(MMO);
215     return;
216   }
217 
218   MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
219     MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
220 
221   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
222     .addImm(Offset);
223 
224   BuildMI(MBB, I, DebugLoc(),
225           TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
226     .addReg(OffsetReg, RegState::Kill)
227     .addReg(ScratchRsrcReg)
228     .addReg(SPReg)
229     .addImm(0)
230     .addImm(0) // glc
231     .addImm(0) // slc
232     .addImm(0) // tfe
233     .addImm(0) // dlc
234     .addImm(0) // swz
235     .addMemOperand(MMO);
236 }
237 
238 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
239 void SIFrameLowering::emitEntryFunctionFlatScratchInit(
240     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
241     const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
242   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
243   const SIInstrInfo *TII = ST.getInstrInfo();
244   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
245   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
246 
247   // We don't need this if we only have spills since there is no user facing
248   // scratch.
249 
250   // TODO: If we know we don't have flat instructions earlier, we can omit
251   // this from the input registers.
252   //
253   // TODO: We only need to know if we access scratch space through a flat
254   // pointer. Because we only detect if flat instructions are used at all,
255   // this will be used more often than necessary on VI.
256 
257   Register FlatScratchInitReg =
258       MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
259 
260   MachineRegisterInfo &MRI = MF.getRegInfo();
261   MRI.addLiveIn(FlatScratchInitReg);
262   MBB.addLiveIn(FlatScratchInitReg);
263 
264   Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
265   Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
266 
267   // Do a 64-bit pointer add.
268   if (ST.flatScratchIsPointer()) {
269     if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
270       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
271         .addReg(FlatScrInitLo)
272         .addReg(ScratchWaveOffsetReg);
273       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
274         .addReg(FlatScrInitHi)
275         .addImm(0);
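      // On GFX10 the flat scratch base is programmed through s_setreg; the
      // immediate selects the FLAT_SCR_LO/HI hardware register and a WIDTH_M1
      // of 31, i.e. a full 32-bit write.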
276       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
277         addReg(FlatScrInitLo).
278         addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
279                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
280       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
281         addReg(FlatScrInitHi).
282         addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
283                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
284       return;
285     }
286 
287     // For GFX9.
288     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
289       .addReg(FlatScrInitLo)
290       .addReg(ScratchWaveOffsetReg);
291     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
292       .addReg(FlatScrInitHi)
293       .addImm(0);
294 
295     return;
296   }
297 
298   assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
299 
300   // Copy the size in bytes.
301   BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
302     .addReg(FlatScrInitHi, RegState::Kill);
303 
304   // Add wave offset in bytes to private base offset.
305   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
306   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
307     .addReg(FlatScrInitLo)
308     .addReg(ScratchWaveOffsetReg);
309 
310   // Convert offset to 256-byte units.
311   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
312     .addReg(FlatScrInitLo, RegState::Kill)
313     .addImm(8);
314 }
315 
316 // Shift down registers reserved for the scratch RSRC.
317 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
318     MachineFunction &MF) const {
319 
320   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
321   const SIInstrInfo *TII = ST.getInstrInfo();
322   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
323   MachineRegisterInfo &MRI = MF.getRegInfo();
324   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
325 
326   assert(MFI->isEntryFunction());
327 
328   Register ScratchRsrcReg = MFI->getScratchRSrcReg();
329 
330   if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
331     return Register();
332 
333   if (ST.hasSGPRInitBug() ||
334       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
335     return ScratchRsrcReg;
336 
337   // We reserved the last registers for this. Shift it down to the end of those
338   // which were actually used.
339   //
340   // FIXME: It might be safer to use a pseudoregister before replacement.
341 
342   // FIXME: We should be able to eliminate unused input registers. We only
343   // cannot do this for the resources required for scratch access. For now we
344   // skip over user SGPRs and may leave unused holes.
345 
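  // getAllSGPR128 returns 4-SGPR-wide tuples, so round the preloaded SGPR
  // count up to tuple granularity before skipping over them.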
346   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
347   ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
349 
350   // Skip the last N reserved elements because they should have already been
351   // reserved for VCC etc.
352   Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
353   for (MCPhysReg Reg : AllSGPR128s) {
354     // Pick the first unallocated one. Make sure we don't clobber the other
355     // reserved input we needed. Also for PAL, make sure we don't clobber
356     // the GIT pointer passed in SGPR0 or SGPR8.
357     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
358         !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
359       MRI.replaceRegWith(ScratchRsrcReg, Reg);
360       MFI->setScratchRSrcReg(Reg);
361       return Reg;
362     }
363   }
364 
365   return ScratchRsrcReg;
366 }
367 
368 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
369                                                 MachineBasicBlock &MBB) const {
370   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
371 
372   // FIXME: If we only have SGPR spills, we won't actually be using scratch
373   // memory since these spill to VGPRs. We should be cleaning up these unused
374   // SGPR spill frame indices somewhere.
375 
376   // FIXME: We still have implicit uses on SGPR spill instructions in case they
377   // need to spill to vector memory. It's likely that will not happen, but at
378   // this point it appears we need the setup. This part of the prolog should be
379   // emitted after frame indices are eliminated.
380 
381   // FIXME: Remove all of the isPhysRegUsed checks
382 
383   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
384   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
385   const SIInstrInfo *TII = ST.getInstrInfo();
386   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
387   MachineRegisterInfo &MRI = MF.getRegInfo();
388   const Function &F = MF.getFunction();
389 
390   assert(MFI->isEntryFunction());
391 
392   Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
393       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
394   // FIXME: Hack to not crash in situations which emitted an error.
395   if (!PreloadedScratchWaveOffsetReg)
396     return;
397 
398   // We need to do the replacement of the private segment buffer register even
399   // if there are no stack objects. There could be stores to undef or a
400   // constant without an associated object.
401   //
402   // This will return `Register()` in cases where there are no actual
403   // uses of the SRSRC.
404   Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
405 
406   // Make the selected register live throughout the function.
407   if (ScratchRsrcReg) {
408     for (MachineBasicBlock &OtherBB : MF) {
409       if (&OtherBB != &MBB) {
410         OtherBB.addLiveIn(ScratchRsrcReg);
411       }
412     }
413   }
414 
415   // Now that we have fixed the reserved SRSRC we need to locate the
416   // (potentially) preloaded SRSRC.
417   Register PreloadedScratchRsrcReg;
418   if (ST.isAmdHsaOrMesa(F)) {
419     PreloadedScratchRsrcReg =
420         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
421     if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
422       // We added live-ins during argument lowering, but since they were not
423       // used they were deleted. We're adding the uses now, so add them back.
424       MRI.addLiveIn(PreloadedScratchRsrcReg);
425       MBB.addLiveIn(PreloadedScratchRsrcReg);
426     }
427   }
428 
429   // Debug location must be unknown since the first debug location is used to
430   // determine the end of the prologue.
431   DebugLoc DL;
432   MachineBasicBlock::iterator I = MBB.begin();
433 
  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found overlaps the scratch
  // wave offset, which may be in a fixed SGPR or in a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
439   Register ScratchWaveOffsetReg;
440   if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
441     ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
442     unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
443     AllSGPRs = AllSGPRs.slice(
444         std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
445     Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
446     for (MCPhysReg Reg : AllSGPRs) {
447       if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
448           !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
449         ScratchWaveOffsetReg = Reg;
450         BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
451             .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
452         break;
453       }
454     }
455   } else {
456     ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
457   }
458   assert(ScratchWaveOffsetReg);
459 
460   if (requiresStackPointerReference(MF)) {
461     Register SPReg = MFI->getStackPtrOffsetReg();
462     assert(SPReg != AMDGPU::SP_REG);
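    // The SP is a byte offset for the entire wave, so scale the per-lane frame
    // size by the wavefront size.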
463     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
464         .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
465   }
466 
467   if (hasFP(MF)) {
468     Register FPReg = MFI->getFrameOffsetReg();
469     assert(FPReg != AMDGPU::FP_REG);
470     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
471   }
472 
473   if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
474     MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
475     MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
476   }
477 
478   if (MFI->hasFlatScratchInit()) {
479     emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
480   }
481 
482   if (ScratchRsrcReg) {
483     emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
484                                          PreloadedScratchRsrcReg,
485                                          ScratchRsrcReg, ScratchWaveOffsetReg);
486   }
487 }
488 
489 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
490 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
491     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
492     const DebugLoc &DL, Register PreloadedScratchRsrcReg,
493     Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
494 
495   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
496   const SIInstrInfo *TII = ST.getInstrInfo();
497   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
498   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
499   const Function &Fn = MF.getFunction();
500 
501   if (ST.isAmdPalOS()) {
502     // The pointer to the GIT is formed from the offset passed in and either
503     // the amdgpu-git-ptr-high function attribute or the top part of the PC
504     Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
505     Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
506     Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
507 
508     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
509 
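    // A GIT ptr high value of 0xffffffff means the amdgpu-git-ptr-high
    // attribute was not supplied, so derive the high half from the PC instead.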
510     if (MFI->getGITPtrHigh() != 0xffffffff) {
511       BuildMI(MBB, I, DL, SMovB32, RsrcHi)
512         .addImm(MFI->getGITPtrHigh())
513         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
514     } else {
515       const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
516       BuildMI(MBB, I, DL, GetPC64, Rsrc01);
517     }
518     Register GitPtrLo = MFI->getGITPtrLoReg(MF);
519     MF.getRegInfo().addLiveIn(GitPtrLo);
520     MBB.addLiveIn(GitPtrLo);
521     BuildMI(MBB, I, DL, SMovB32, RsrcLo)
522       .addReg(GitPtrLo)
523       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
524 
525     // We now have the GIT ptr - now get the scratch descriptor from the entry
526     // at offset 0 (or offset 16 for a compute shader).
527     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
528     const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
529     auto MMO = MF.getMachineMemOperand(PtrInfo,
530                                        MachineMemOperand::MOLoad |
531                                            MachineMemOperand::MOInvariant |
532                                            MachineMemOperand::MODereferenceable,
533                                        16, Align(4));
534     unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
535     const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
536     unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
537     BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
538       .addReg(Rsrc01)
539       .addImm(EncodedOffset) // offset
540       .addImm(0) // glc
541       .addImm(0) // dlc
542       .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
543       .addMemOperand(MMO);
544   } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
545     assert(!ST.isAmdHsaOrMesa(Fn));
546     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
547 
548     Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
549     Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
550 
551     // Use relocations to get the pointer, and setup the other bits manually.
552     uint64_t Rsrc23 = TII->getScratchRsrcWords23();
553 
554     if (MFI->hasImplicitBufferPtr()) {
555       Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
556 
557       if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
558         const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
559 
560         BuildMI(MBB, I, DL, Mov64, Rsrc01)
561           .addReg(MFI->getImplicitBufferPtrUserSGPR())
562           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
563       } else {
564         const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
565 
566         MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
567         auto MMO = MF.getMachineMemOperand(
568             PtrInfo,
569             MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
570                 MachineMemOperand::MODereferenceable,
571             8, Align(4));
572         BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
573           .addReg(MFI->getImplicitBufferPtrUserSGPR())
574           .addImm(0) // offset
575           .addImm(0) // glc
576           .addImm(0) // dlc
577           .addMemOperand(MMO)
578           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
579 
580         MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
581         MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
582       }
583     } else {
584       Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
585       Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
586 
587       BuildMI(MBB, I, DL, SMovB32, Rsrc0)
588         .addExternalSymbol("SCRATCH_RSRC_DWORD0")
589         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
590 
591       BuildMI(MBB, I, DL, SMovB32, Rsrc1)
592         .addExternalSymbol("SCRATCH_RSRC_DWORD1")
593         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
594 
595     }
596 
597     BuildMI(MBB, I, DL, SMovB32, Rsrc2)
598       .addImm(Rsrc23 & 0xffffffff)
599       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
600 
601     BuildMI(MBB, I, DL, SMovB32, Rsrc3)
602       .addImm(Rsrc23 >> 32)
603       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
604   } else if (ST.isAmdHsaOrMesa(Fn)) {
605     assert(PreloadedScratchRsrcReg);
606 
607     if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
608       BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
609           .addReg(PreloadedScratchRsrcReg, RegState::Kill);
610     }
611   }
612 
613   // Add the scratch wave offset into the scratch RSRC.
614   //
615   // We only want to update the first 48 bits, which is the base address
616   // pointer, without touching the adjacent 16 bits of flags. We know this add
617   // cannot carry-out from bit 47, otherwise the scratch allocation would be
618   // impossible to fit in the 48-bit global address space.
619   //
620   // TODO: Evaluate if it is better to just construct an SRD using the flat
621   // scratch init and some constants rather than update the one we are passed.
622   Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
623   Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
624 
625   // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
626   // the kernel body via inreg arguments.
627   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
628       .addReg(ScratchRsrcSub0)
629       .addReg(ScratchWaveOffsetReg)
630       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
631   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
632       .addReg(ScratchRsrcSub1)
633       .addImm(0)
634       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
635 }
636 
637 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
638   switch (ID) {
639   case TargetStackID::Default:
640   case TargetStackID::NoAlloc:
641   case TargetStackID::SGPRSpill:
642     return true;
643   case TargetStackID::SVEVector:
644     return false;
645   }
646   llvm_unreachable("Invalid TargetStackID::Value");
647 }
648 
649 // Activate all lanes, returns saved exec.
650 static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
651                                      MachineFunction &MF,
652                                      MachineBasicBlock &MBB,
653                                      MachineBasicBlock::iterator MBBI,
654                                      bool IsProlog) {
655   Register ScratchExecCopy;
656   MachineRegisterInfo &MRI = MF.getRegInfo();
657   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
658   const SIInstrInfo *TII = ST.getInstrInfo();
659   const SIRegisterInfo &TRI = TII->getRegisterInfo();
660   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
661   DebugLoc DL;
662 
663   if (LiveRegs.empty()) {
664     if (IsProlog) {
665       LiveRegs.init(TRI);
666       LiveRegs.addLiveIns(MBB);
667       if (FuncInfo->SGPRForFPSaveRestoreCopy)
668         LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
669 
670       if (FuncInfo->SGPRForBPSaveRestoreCopy)
671         LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
672     } else {
673       // In epilog.
674       LiveRegs.init(*ST.getRegisterInfo());
675       LiveRegs.addLiveOuts(MBB);
676       LiveRegs.stepBackward(*MBBI);
677     }
678   }
679 
680   ScratchExecCopy = findScratchNonCalleeSaveRegister(
681       MRI, LiveRegs, *TRI.getWaveMaskRegClass());
682 
683   if (!IsProlog)
684     LiveRegs.removeReg(ScratchExecCopy);
685 
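  // Save the current exec mask and enable all lanes so that the CSR VGPR
  // stores and loads also cover lanes that were inactive on entry.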
686   const unsigned OrSaveExec =
687       ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
688   BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);
689 
690   return ScratchExecCopy;
691 }
692 
693 void SIFrameLowering::emitPrologue(MachineFunction &MF,
694                                    MachineBasicBlock &MBB) const {
695   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
696   if (FuncInfo->isEntryFunction()) {
697     emitEntryFunctionPrologue(MF, MBB);
698     return;
699   }
700 
701   const MachineFrameInfo &MFI = MF.getFrameInfo();
702   MachineRegisterInfo &MRI = MF.getRegInfo();
703   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
704   const SIInstrInfo *TII = ST.getInstrInfo();
705   const SIRegisterInfo &TRI = TII->getRegisterInfo();
706 
707   Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
708   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
709   Register BasePtrReg =
710       TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
711   LivePhysRegs LiveRegs;
712 
713   MachineBasicBlock::iterator MBBI = MBB.begin();
714   DebugLoc DL;
715 
716   bool HasFP = false;
717   bool HasBP = false;
718   uint32_t NumBytes = MFI.getStackSize();
719   uint32_t RoundedSize = NumBytes;
720   // To avoid clobbering VGPRs in lanes that weren't active on function entry,
721   // turn on all lanes before doing the spill to memory.
722   Register ScratchExecCopy;
723 
724   bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
725   bool SpillFPToMemory = false;
726   // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
727   // Otherwise we are spilling the FP to memory.
728   if (HasFPSaveIndex) {
729     SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
730                       TargetStackID::SGPRSpill;
731   }
732 
733   bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
734   bool SpillBPToMemory = false;
735   // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
736   // Otherwise we are spilling the BP to memory.
737   if (HasBPSaveIndex) {
738     SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
739                       TargetStackID::SGPRSpill;
740   }
741 
742   // Emit the copy if we need an FP, and are using a free SGPR to save it.
743   if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
            FuncInfo->SGPRForFPSaveRestoreCopy)
        .addReg(FramePtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
747   }
748 
749   // Emit the copy if we need a BP, and are using a free SGPR to save it.
750   if (FuncInfo->SGPRForBPSaveRestoreCopy) {
751     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
752             FuncInfo->SGPRForBPSaveRestoreCopy)
753         .addReg(BasePtrReg)
754         .setMIFlag(MachineInstr::FrameSetup);
755   }
756 
  // If a copy has been emitted for the FP and/or BP, make the SGPRs used in
  // the copy instructions live throughout the function.
759   SmallVector<MCPhysReg, 2> TempSGPRs;
760   if (FuncInfo->SGPRForFPSaveRestoreCopy)
761     TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);
762 
763   if (FuncInfo->SGPRForBPSaveRestoreCopy)
764     TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);
765 
766   if (!TempSGPRs.empty()) {
767     for (MachineBasicBlock &MBB : MF) {
768       for (MCPhysReg Reg : TempSGPRs)
769         MBB.addLiveIn(Reg);
770 
771       MBB.sortUniqueLiveIns();
772     }
773   }
774 
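  // Spill the VGPRs reserved for SGPR spilling. These are excluded from the
  // generic callee-save handling in determineCalleeSaves, so save them here.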
775   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
776          : FuncInfo->getSGPRSpillVGPRs()) {
777     if (!Reg.FI.hasValue())
778       continue;
779 
780     if (!ScratchExecCopy)
781       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
782 
783     buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
784                      FuncInfo->getScratchRSrcReg(),
785                      StackPtrReg,
786                      Reg.FI.getValue());
787   }
788 
789   if (HasFPSaveIndex && SpillFPToMemory) {
790     assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));
791 
792     if (!ScratchExecCopy)
793       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
794 
795     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
796         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
797 
798     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
799         .addReg(FramePtrReg);
800 
801     buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
802                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
803                      FuncInfo->FramePointerSaveIndex.getValue());
804   }
805 
806   if (HasBPSaveIndex && SpillBPToMemory) {
807     assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));
808 
809     if (!ScratchExecCopy)
810       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
811 
812     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
813         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
814 
815     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
816         .addReg(BasePtrReg);
817 
818     buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
819                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
820                      *FuncInfo->BasePointerSaveIndex);
821   }
822 
823   if (ScratchExecCopy) {
824     // FIXME: Split block and make terminator.
825     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
826     MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
827     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
828         .addReg(ScratchExecCopy, RegState::Kill);
829     LiveRegs.addReg(ScratchExecCopy);
830   }
831 
832   // In this case, spill the FP to a reserved VGPR.
833   if (HasFPSaveIndex && !SpillFPToMemory) {
834     const int FI = FuncInfo->FramePointerSaveIndex.getValue();
835     assert(!MFI.isDeadObjectIndex(FI));
836 
837     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
838     ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
839         FuncInfo->getSGPRToVGPRSpills(FI);
840     assert(Spill.size() == 1);
841 
842     // Save FP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
844     BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
845             Spill[0].VGPR)
846         .addReg(FramePtrReg)
847         .addImm(Spill[0].Lane)
848         .addReg(Spill[0].VGPR, RegState::Undef);
849   }
850 
851   // In this case, spill the BP to a reserved VGPR.
852   if (HasBPSaveIndex && !SpillBPToMemory) {
853     const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
854     assert(!MFI.isDeadObjectIndex(BasePtrFI));
855 
856     assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
857     ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
858         FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
859     assert(Spill.size() == 1);
860 
861     // Save BP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
863     BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
864             Spill[0].VGPR)
865         .addReg(BasePtrReg)
866         .addImm(Spill[0].Lane)
867         .addReg(Spill[0].VGPR, RegState::Undef);
868   }
869 
870   if (TRI.needsStackRealignment(MF)) {
871     HasFP = true;
872     const unsigned Alignment = MFI.getMaxAlign().value();
873 
874     RoundedSize += Alignment;
875     if (LiveRegs.empty()) {
876       LiveRegs.init(TRI);
877       LiveRegs.addLiveIns(MBB);
878       LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
879       LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
880     }
881 
882     Register ScratchSPReg = findScratchNonCalleeSaveRegister(
883         MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
884     assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
885            ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);
886 
887     // s_add_u32 tmp_reg, s32, NumBytes
888     // s_and_b32 s32, tmp_reg, 0b111...0000
889     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
890         .addReg(StackPtrReg)
891         .addImm((Alignment - 1) * ST.getWavefrontSize())
892         .setMIFlag(MachineInstr::FrameSetup);
893     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
894         .addReg(ScratchSPReg, RegState::Kill)
895         .addImm(-Alignment * ST.getWavefrontSize())
896         .setMIFlag(MachineInstr::FrameSetup);
897     FuncInfo->setIsStackRealigned(true);
898   } else if ((HasFP = hasFP(MF))) {
899     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
900         .addReg(StackPtrReg)
901         .setMIFlag(MachineInstr::FrameSetup);
902   }
903 
904   // If we need a base pointer, set it up here. It's whatever the value of
905   // the stack pointer is at this point. Any variable size objects will be
906   // allocated after this, so we can still use the base pointer to reference
907   // the incoming arguments.
908   if ((HasBP = TRI.hasBasePointer(MF))) {
909     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
910         .addReg(StackPtrReg)
911         .setMIFlag(MachineInstr::FrameSetup);
912   }
913 
914   if (HasFP && RoundedSize != 0) {
915     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
916         .addReg(StackPtrReg)
917         .addImm(RoundedSize * ST.getWavefrontSize())
918         .setMIFlag(MachineInstr::FrameSetup);
919   }
920 
921   assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
922                      FuncInfo->FramePointerSaveIndex)) &&
923          "Needed to save FP but didn't save it anywhere");
924 
925   assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
926                     !FuncInfo->FramePointerSaveIndex)) &&
927          "Saved FP but didn't need it");
928 
929   assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
930                      FuncInfo->BasePointerSaveIndex)) &&
931          "Needed to save BP but didn't save it anywhere");
932 
933   assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
934                     !FuncInfo->BasePointerSaveIndex)) &&
935          "Saved BP but didn't need it");
936 }
937 
938 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
939                                    MachineBasicBlock &MBB) const {
940   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
941   if (FuncInfo->isEntryFunction())
942     return;
943 
944   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
945   const SIInstrInfo *TII = ST.getInstrInfo();
946   MachineRegisterInfo &MRI = MF.getRegInfo();
947   const SIRegisterInfo &TRI = TII->getRegisterInfo();
948   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
949   LivePhysRegs LiveRegs;
950   DebugLoc DL;
951 
952   const MachineFrameInfo &MFI = MF.getFrameInfo();
953   uint32_t NumBytes = MFI.getStackSize();
954   uint32_t RoundedSize = FuncInfo->isStackRealigned()
955                              ? NumBytes + MFI.getMaxAlign().value()
956                              : NumBytes;
957   const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
958   const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
959   const Register BasePtrReg =
960       TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
961 
962   bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
963   bool SpillFPToMemory = false;
964   if (HasFPSaveIndex) {
965     SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
966                       TargetStackID::SGPRSpill;
967   }
968 
969   bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
970   bool SpillBPToMemory = false;
971   if (HasBPSaveIndex) {
972     SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
973                       TargetStackID::SGPRSpill;
974   }
975 
976   if (RoundedSize != 0 && hasFP(MF)) {
977     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
978       .addReg(StackPtrReg)
979       .addImm(RoundedSize * ST.getWavefrontSize())
980       .setMIFlag(MachineInstr::FrameDestroy);
981   }
982 
983   if (FuncInfo->SGPRForFPSaveRestoreCopy) {
984     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
985         .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
987   }
988 
989   if (FuncInfo->SGPRForBPSaveRestoreCopy) {
990     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
991         .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
993   }
994 
995   Register ScratchExecCopy;
996   if (HasFPSaveIndex) {
997     const int FI = FuncInfo->FramePointerSaveIndex.getValue();
998     assert(!MFI.isDeadObjectIndex(FI));
999     if (SpillFPToMemory) {
1000       if (!ScratchExecCopy)
1001         ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
1002 
1003       MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
1004           MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
1005       buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
1006                         FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
1007       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
1008           .addReg(TempVGPR, RegState::Kill);
1009     } else {
1010       // Reload from VGPR spill.
1011       assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
1012       ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
1013           FuncInfo->getSGPRToVGPRSpills(FI);
1014       assert(Spill.size() == 1);
1015       BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
1016               FramePtrReg)
1017           .addReg(Spill[0].VGPR)
1018           .addImm(Spill[0].Lane);
1019     }
1020   }
1021 
1022   if (HasBPSaveIndex) {
1023     const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
1024     assert(!MFI.isDeadObjectIndex(BasePtrFI));
1025     if (SpillBPToMemory) {
1026       if (!ScratchExecCopy)
1027         ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
1028 
1029       MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
1030           MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
1031       buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
1032                         FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
1033       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
1034           .addReg(TempVGPR, RegState::Kill);
1035     } else {
1036       // Reload from VGPR spill.
1037       assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
1038       ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
1039           FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
1040       assert(Spill.size() == 1);
1041       BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
1042               BasePtrReg)
1043           .addReg(Spill[0].VGPR)
1044           .addImm(Spill[0].Lane);
1045     }
1046   }
1047 
1048   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
1049        FuncInfo->getSGPRSpillVGPRs()) {
1050     if (!Reg.FI.hasValue())
1051       continue;
1052 
1053     if (!ScratchExecCopy)
1054       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
1055 
1056     buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
1057                       FuncInfo->getScratchRSrcReg(), StackPtrReg,
1058                       Reg.FI.getValue());
1059   }
1060 
1061   if (ScratchExecCopy) {
1062     // FIXME: Split block and make terminator.
1063     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1064     MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1065     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
1066         .addReg(ScratchExecCopy, RegState::Kill);
1067   }
1068 }
1069 
1070 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
1071 // memory. They should have been removed by now.
1072 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
1073   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1074        I != E; ++I) {
1075     if (!MFI.isDeadObjectIndex(I))
1076       return false;
1077   }
1078 
1079   return true;
1080 }
1081 
1082 #ifndef NDEBUG
1083 static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1084   const MachineFrameInfo &MFI = MF.getFrameInfo();
1085   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1086   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1087        I != E; ++I) {
1088     if (!MFI.isDeadObjectIndex(I) &&
1089         MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1090         (I != FuncInfo->FramePointerSaveIndex &&
1091          I != FuncInfo->BasePointerSaveIndex)) {
1092       return false;
1093     }
1094   }
1095 
1096   return true;
1097 }
1098 #endif
1099 
1100 int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
1101                                             Register &FrameReg) const {
1102   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1103 
1104   FrameReg = RI->getFrameRegister(MF);
1105   return MF.getFrameInfo().getObjectOffset(FI);
1106 }
1107 
1108 void SIFrameLowering::processFunctionBeforeFrameFinalized(
1109   MachineFunction &MF,
1110   RegScavenger *RS) const {
1111   MachineFrameInfo &MFI = MF.getFrameInfo();
1112 
1113   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1114   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1115   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1116 
1117   FuncInfo->removeDeadFrameIndices(MFI);
1118   assert(allSGPRSpillsAreDead(MF) &&
1119          "SGPR spill should have been removed in SILowerSGPRSpills");
1120 
1121   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1122   // but currently hasNonSpillStackObjects is set only from source
1123   // allocas. Stack temps produced from legalization are not counted currently.
1124   if (!allStackObjectsAreDead(MFI)) {
1125     assert(RS && "RegScavenger required if spilling");
1126 
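    // Entry functions can take a fixed object at offset 0 for the emergency
    // scavenge slot; callable functions allocate an ordinary stack object.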
1127     if (FuncInfo->isEntryFunction()) {
1128       int ScavengeFI = MFI.CreateFixedObject(
1129         TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
1130       RS->addScavengingFrameIndex(ScavengeFI);
1131     } else {
1132       int ScavengeFI = MFI.CreateStackObject(
1133           TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
1134           TRI->getSpillAlign(AMDGPU::SGPR_32RegClass), false);
1135       RS->addScavengingFrameIndex(ScavengeFI);
1136     }
1137   }
1138 }
1139 
1140 // Only report VGPRs to generic code.
1141 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1142                                            BitVector &SavedVGPRs,
1143                                            RegScavenger *RS) const {
1144   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
1145   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1146   if (MFI->isEntryFunction())
1147     return;
1148 
1149   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1150   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1151   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1152 
1153   // Ignore the SGPRs the default implementation found.
1154   SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
1155 
1156   // hasFP only knows about stack objects that already exist. We're now
1157   // determining the stack slots that will be created, so we have to predict
1158   // them. Stack objects force FP usage with calls.
1159   //
1160   // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1161   // don't want to report it here.
1162   //
1163   // FIXME: Is this really hasReservedCallFrame?
1164   const bool WillHaveFP =
1165       FrameInfo.hasCalls() &&
1166       (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1167 
1168   // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
1169   // so don't allow the default insertion to handle them.
1170   for (auto SSpill : MFI->getSGPRSpillVGPRs())
1171     SavedVGPRs.reset(SSpill.VGPR);
1172 
1173   LivePhysRegs LiveRegs;
1174   LiveRegs.init(*TRI);
1175 
1176   if (WillHaveFP || hasFP(MF)) {
1177     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
1178                                    MFI->FramePointerSaveIndex, true);
1179   }
1180 
1181   if (TRI->hasBasePointer(MF)) {
1182     if (MFI->SGPRForFPSaveRestoreCopy)
1183       LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
1184     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
1185                                    MFI->BasePointerSaveIndex, false);
1186   }
1187 }
1188 
1189 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1190                                                BitVector &SavedRegs,
1191                                                RegScavenger *RS) const {
1192   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1193   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1194   if (MFI->isEntryFunction())
1195     return;
1196 
1197   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1198   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1199 
1200   // The SP is specifically managed and we don't want extra spills of it.
1201   SavedRegs.reset(MFI->getStackPtrOffsetReg());
1202   SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
1203 }
1204 
1205 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1206     MachineFunction &MF, const TargetRegisterInfo *TRI,
1207     std::vector<CalleeSavedInfo> &CSI) const {
1208   if (CSI.empty())
1209     return true; // Early exit if no callee saved registers are modified!
1210 
1211   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1212   if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
1213       !FuncInfo->SGPRForBPSaveRestoreCopy)
1214     return false;
1215 
1216   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1217   const SIRegisterInfo *RI = ST.getRegisterInfo();
1218   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1219   Register BasePtrReg = RI->getBaseRegister();
1220   unsigned NumModifiedRegs = 0;
1221 
1222   if (FuncInfo->SGPRForFPSaveRestoreCopy)
1223     NumModifiedRegs++;
1224   if (FuncInfo->SGPRForBPSaveRestoreCopy)
1225     NumModifiedRegs++;
1226 
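  // Redirect the FP/BP spill slots to the SGPR copies and stop iterating once
  // every register that is kept in an SGPR has been handled.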
1227   for (auto &CS : CSI) {
1228     if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
1229       CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      if (!--NumModifiedRegs)
1231         break;
1232     } else if (CS.getReg() == BasePtrReg &&
1233                FuncInfo->SGPRForBPSaveRestoreCopy) {
1234       CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
      if (!--NumModifiedRegs)
1236         break;
1237     }
1238   }
1239 
1240   return false;
1241 }
1242 
1243 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1244   MachineFunction &MF,
1245   MachineBasicBlock &MBB,
1246   MachineBasicBlock::iterator I) const {
1247   int64_t Amount = I->getOperand(0).getImm();
1248   if (Amount == 0)
1249     return MBB.erase(I);
1250 
1251   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1252   const SIInstrInfo *TII = ST.getInstrInfo();
1253   const DebugLoc &DL = I->getDebugLoc();
1254   unsigned Opc = I->getOpcode();
1255   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1256   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1257 
1258   if (!hasReservedCallFrame(MF)) {
1259     Amount = alignTo(Amount, getStackAlign());
1260     assert(isUInt<32>(Amount) && "exceeded stack address space size");
1261     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1262     Register SPReg = MFI->getStackPtrOffsetReg();
1263 
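    // Stack pointer adjustments are per-wave byte amounts, so the per-lane
    // Amount is scaled by the wavefront size.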
1264     unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
1265     BuildMI(MBB, I, DL, TII->get(Op), SPReg)
1266       .addReg(SPReg)
1267       .addImm(Amount * ST.getWavefrontSize());
1268   } else if (CalleePopAmount != 0) {
1269     llvm_unreachable("is this used?");
1270   }
1271 
1272   return MBB.erase(I);
1273 }
1274 
1275 /// Returns true if the frame will require a reference to the stack pointer.
1276 ///
1277 /// This is the set of conditions common to setting up the stack pointer in a
1278 /// kernel, and for using a frame pointer in a callable function.
1279 ///
1280 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1281 /// references SP.
1282 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1283   return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1284 }
1285 
1286 // The FP for kernels is always known 0, so we never really need to setup an
1287 // explicit register for it. However, DisableFramePointerElim will force us to
1288 // use a register for it.
1289 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1290   const MachineFrameInfo &MFI = MF.getFrameInfo();
1291 
1292   // For entry functions we can use an immediate offset in most cases, so the
1293   // presence of calls doesn't imply we need a distinct frame pointer.
1294   if (MFI.hasCalls() &&
1295       !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1296     // All offsets are unsigned, so need to be addressed in the same direction
1297     // as stack growth.
1298 
1299     // FIXME: This function is pretty broken, since it can be called before the
1300     // frame layout is determined or CSR spills are inserted.
1301     return MFI.getStackSize() != 0;
1302   }
1303 
1304   return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1305     MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
1306     MF.getTarget().Options.DisableFramePointerElim(MF);
1307 }
1308 
1309 // This is essentially a reduced version of hasFP for entry functions. Since the
1310 // stack pointer is known 0 on entry to kernels, we never really need an FP
1311 // register. We may need to initialize the stack pointer depending on the frame
1312 // properties, which logically overlaps many of the cases where an ordinary
1313 // function would require an FP.
1314 bool SIFrameLowering::requiresStackPointerReference(
1315     const MachineFunction &MF) const {
1316   // Callable functions always require a stack pointer reference.
1317   assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1318          "only expected to call this for entry points");
1319 
1320   const MachineFrameInfo &MFI = MF.getFrameInfo();
1321 
1322   // Entry points ordinarily don't need to initialize SP. We have to set it up
1323   // for callees if there are any. Also note tail calls are impossible/don't
1324   // make any sense for kernels.
1325   if (MFI.hasCalls())
1326     return true;
1327 
1328   // We still need to initialize the SP if we're doing anything weird that
1329   // references the SP, like variable sized stack objects.
1330   return frameTriviallyRequiresSP(MFI);
1331 }
1332