//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                         const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                                       const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  if (Unused) {
    // We are looking for a register that can be used throughout the entire
    // function, so any use is unacceptable.
    for (MCRegister Reg : RC) {
      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
        return Reg;
    }
  } else {
    for (MCRegister Reg : RC) {
      if (LiveRegs.available(MRI, Reg))
        return Reg;
    }
  }

  // When an unused register is required, the caller can tolerate failure and
  // has an alternative plan. In all other contexts this must succeed.
  if (!Unused)
    report_fatal_error("failed to find free scratch register");

  return MCRegister();
}

static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
                                           LivePhysRegs &LiveRegs,
                                           Register &TempSGPR,
                                           Optional<int> &FrameIndex,
                                           bool IsFP) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

#ifndef NDEBUG
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
#endif

  // We need to save and restore the current FP/BP.
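  //
  // The strategies below are tried in order:
  //   1. Spill to a free lane of a VGPR that already holds SGPR spills.
  //   2. Copy to an unused SGPR.
  //   3. Spill a new VGPR to free up a lane.
  //   4. As a last resort, spill to scratch memory.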

  // 1: If there is already a VGPR with free lanes, use it. We
  // may already have to pay the penalty for spilling a CSR VGPR.
  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");

    FrameIndex = NewFI;

    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
               dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                      << '\n');
    return;
  }

  // 2: Next, try to save the FP/BP in an unused SGPR.
  TempSGPR = findScratchNonCalleeSaveRegister(
      MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);

  if (!TempSGPR) {
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP/BP,
      // so we're forced to spill another VGPR to use for the spill.
      FrameIndex = NewFI;

      LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
                 dbgs() << (IsFP ? "FP" : "BP")
                        << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n');
    } else {
      // 4: If all else fails, spill the FP/BP to memory.
      FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
      LLVM_DEBUG(dbgs() << (IsFP ? "FP" : "BP")
                        << " requires fallback spill to memory\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
                      << printReg(TempSGPR, TRI) << '\n');
  }
}

// We need to emit stack operations specially here because the prologue and
// epilogue use a different frame register than the one getFrameRegister
// returns for the rest of the function.
static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I,
                             const SIInstrInfo *TII, Register SpillReg,
                             Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();

  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
      MFI.getObjectAlign(FI));

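  // The MUBUF offset field is a 12-bit unsigned immediate, so frame offsets
  // 0..4095 can be folded directly into the _OFFSET form below; anything
  // larger must first be materialized into a VGPR and use the _OFFEN form.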
  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
      .addReg(SpillReg, RegState::Kill)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
    .addReg(SpillReg, RegState::Kill)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addImm(0) // swz
    .addMemOperand(MMO);
}

static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
                              const SIInstrInfo *TII, Register SpillReg,
                              Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
      MFI.getObjectAlign(FI));

  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(),
            TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(),
          TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addImm(0) // swz
    .addMemOperand(MMO);
}

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`.
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScratchInitReg =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  // Do a 64-bit pointer add.
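  // On GFX10 the flat scratch base cannot be written through the FLAT_SCR
  // register pair and is instead programmed via S_SETREG_B32 (a sketch of
  // the encoding used below, not authoritative hardware documentation: the
  // immediate packs the hwreg ID with the field width encoded as
  // width-minus-one, so 31 selects all 32 bits).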
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
        .addReg(FlatScrInitLo)
        .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
        .addReg(FlatScrInitHi)
        .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
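  // For example (illustrative values): a combined byte offset of 0x2000
  // shifts down to 0x20, since on these subtargets FLAT_SCR_HI is
  // interpreted in 256-byte granules.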
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. The only
  // registers we cannot eliminate are the resources required for scratch
  // access. For now we skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
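  // The + 3 rounds up to whole 4-SGPR tuples, e.g. (illustrative) 13
  // preloaded SGPRs occupy (13 + 3) / 4 = 4 SGPR128 tuples, so the first 4
  // tuples are skipped below to avoid overlapping any preloaded input.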
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks.

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
  // FIXME: Hack to not crash in situations which emitted an error.
  if (!PreloadedScratchWaveOffsetReg)
    return;

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg);

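  // The SP holds a wave-relative byte offset into swizzled scratch, so the
  // per-lane frame size is scaled by the wavefront size below; e.g.
  // (illustrative) a 16-byte frame on a wave64 target initializes SP to
  // 16 * 64 = 1024.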
  if (MF.getFrameInfo().hasCalls()) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (MFI->hasFlatScratchInit()) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`.
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
    Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    if (MFI->getGITPtrHigh() != 0xffffffff) {
      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    } else {
      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
    }
    Register GitPtrLo = MFI->getGITPtrLoReg(MF);
    MF.getRegInfo().addLiveIn(GitPtrLo);
    MBB.addLiveIn(GitPtrLo);
    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
      .addReg(GitPtrLo)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
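    // A note on the units (a sketch; the exact encoding is subtarget
    // dependent): convertSMRDOffsetUnits translates the byte offset into
    // whatever the SMRD immediate field expects, e.g. a 16-byte offset
    // encodes as 4 on targets with dword-granular SMRD offsets.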
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // glc
      .addImm(0) // dlc
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // glc
          .addImm(0) // dlc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
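  //
  // A sketch of the arithmetic below: the 64-bit add is split into two
  // 32-bit scalar ops. S_ADD_U32 adds the wave offset into bits [31:0] and
  // sets SCC on carry-out of bit 31; S_ADDC_U32 then folds that carry into
  // bits [63:32], of which only [47:32] belong to the base address.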
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::SVEVector:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate all lanes, returns saved exec.
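//
// A sketch of the S_OR_SAVEEXEC semantics relied on here:
//   ScratchExecCopy = EXEC;
//   EXEC = EXEC | -1;   // all lanes enabled
// so a single instruction both saves the old mask and turns on every lane.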
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     bool IsProlog) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  DebugLoc DL;

  if (LiveRegs.empty()) {
    if (IsProlog) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      if (FuncInfo->SGPRForFPSaveRestoreCopy)
        LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);

      if (FuncInfo->SGPRForBPSaveRestoreCopy)
        LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
    } else {
      // In epilog.
      LiveRegs.init(*ST.getRegisterInfo());
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());

  if (!IsProlog)
    LiveRegs.removeReg(ScratchExecCopy);

  const unsigned OrSaveExec =
      ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
  BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);

  return ScratchExecCopy;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;
  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
  // turn on all lanes before doing the spill to memory.
  Register ScratchExecCopy;

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
  // Otherwise we are spilling the FP to memory.
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
  bool SpillBPToMemory = false;
  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
  // Otherwise we are spilling the BP to memory.
  if (HasBPSaveIndex) {
    SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  // Emit the copy if we need an FP, and are using a free SGPR to save it.
  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
            FuncInfo->SGPRForFPSaveRestoreCopy)
        .addReg(FramePtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Emit the copy if we need a BP, and are using a free SGPR to save it.
  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
            FuncInfo->SGPRForBPSaveRestoreCopy)
        .addReg(BasePtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If a copy has been emitted for FP and/or BP, make the SGPRs
  // used in the copy instructions live throughout the function.
  SmallVector<MCPhysReg, 2> TempSGPRs;
  if (FuncInfo->SGPRForFPSaveRestoreCopy)
    TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);

  if (FuncInfo->SGPRForBPSaveRestoreCopy)
    TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);

  if (!TempSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : TempSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                     FuncInfo->getScratchRSrcReg(),
                     StackPtrReg,
                     Reg.FI.getValue());
  }

  if (HasFPSaveIndex && SpillFPToMemory) {
    assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(FramePtrReg);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
                     FuncInfo->FramePointerSaveIndex.getValue());
  }

  if (HasBPSaveIndex && SpillBPToMemory) {
    assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(BasePtrReg);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
                     *FuncInfo->BasePointerSaveIndex);
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }

  // In this case, spill the FP to a reserved VGPR.
  if (HasFPSaveIndex && !SpillFPToMemory) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
        FuncInfo->getSGPRToVGPRSpills(FI);
    assert(Spill.size() == 1);

    // Save FP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            Spill[0].VGPR)
        .addReg(FramePtrReg)
        .addImm(Spill[0].Lane)
        .addReg(Spill[0].VGPR, RegState::Undef);
  }

  // In this case, spill the BP to a reserved VGPR.
  if (HasBPSaveIndex && !SpillBPToMemory) {
    const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
    assert(!MFI.isDeadObjectIndex(BasePtrFI));

    assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
        FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
    assert(Spill.size() == 1);

    // Save BP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            Spill[0].VGPR)
        .addReg(BasePtrReg)
        .addImm(Spill[0].Lane)
        .addReg(Spill[0].VGPR, RegState::Undef);
  }

  if (TRI.needsStackRealignment(MF)) {
    HasFP = true;
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
    }

    Register ScratchSPReg = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
           ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);

    // s_add_u32 tmp_reg, s32, NumBytes
    // s_and_b32 s32, tmp_reg, 0b111...0000
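    //
    // A worked example (illustrative values): with a 16-byte per-lane max
    // alignment on a wave64 target, this adds (16 - 1) * 64 = 960 and masks
    // with -(16 * 64) = -1024, rounding the wave-relative SP up to the next
    // 1024-byte boundary.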
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
        .addReg(ScratchSPReg, RegState::Kill)
        .addImm(-Alignment * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
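  // For example (illustrative): after a later dynamic stack allocation moves
  // SP, an incoming stack argument can still be addressed at a fixed offset
  // from the BP copy made here.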
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
  }

  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
                     FuncInfo->FramePointerSaveIndex)) &&
         "Needed to save FP but didn't save it anywhere");

  assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
                    !FuncInfo->FramePointerSaveIndex)) &&
         "Saved FP but didn't need it");

  assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
                     FuncInfo->BasePointerSaveIndex)) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
                    !FuncInfo->BasePointerSaveIndex)) &&
         "Saved BP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  LivePhysRegs LiveRegs;
  DebugLoc DL;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  const Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
  bool SpillBPToMemory = false;
  if (HasBPSaveIndex) {
    SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  Register ScratchExecCopy;
  if (HasFPSaveIndex) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));
    if (SpillFPToMemory) {
      if (!ScratchExecCopy)
        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
                        FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
          .addReg(TempVGPR, RegState::Kill);
    } else {
      // Reload from VGPR spill.
      assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
          FuncInfo->getSGPRToVGPRSpills(FI);
      assert(Spill.size() == 1);
      BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
              FramePtrReg)
          .addReg(Spill[0].VGPR)
          .addImm(Spill[0].Lane);
    }
  }

  if (HasBPSaveIndex) {
    const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
    assert(!MFI.isDeadObjectIndex(BasePtrFI));
    if (SpillBPToMemory) {
      if (!ScratchExecCopy)
        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
                        FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
          .addReg(TempVGPR, RegState::Kill);
    } else {
      // Reload from VGPR spill.
      assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
          FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
      assert(Spill.size() == 1);
      BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
              BasePtrReg)
          .addReg(Spill[0].VGPR)
          .addImm(Spill[0].Lane);
    }
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
       FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

    buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
                      Reg.FI.getValue());
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
                                 Optional<int> FramePointerSaveIndex,
                                 Optional<int> BasePointerSaveIndex) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        ((FramePointerSaveIndex && I != FramePointerSaveIndex) ||
         (BasePointerSaveIndex && I != BasePointerSaveIndex))) {
      return false;
    }
  }

  return true;
}
#endif

int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                            Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return MF.getFrameInfo().getObjectOffset(FI);
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  FuncInfo->removeDeadFrameIndices(MFI);
  assert(allSGPRSpillsAreDead(MFI, None, None) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    if (FuncInfo->isEntryFunction()) {
      int ScavengeFI = MFI.CreateFixedObject(
        TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
      RS->addScavengingFrameIndex(ScavengeFI);
    } else {
      int ScavengeFI = MFI.CreateStackObject(
        TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
        TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
        false);
      RS->addScavengingFrameIndex(ScavengeFI);
    }
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
  // so don't allow the default insertion to handle them.
  for (auto SSpill : MFI->getSGPRSpillVGPRs())
    SavedVGPRs.reset(SSpill.VGPR);

  LivePhysRegs LiveRegs;
  LiveRegs.init(*TRI);

  if (WillHaveFP || hasFP(MF)) {
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
                                   MFI->FramePointerSaveIndex, true);
  }

  if (TRI->hasBasePointer(MF)) {
    if (MFI->SGPRForFPSaveRestoreCopy)
      LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
                                   MFI->BasePointerSaveIndex, false);
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());
  SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
      !FuncInfo->SGPRForBPSaveRestoreCopy)
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  unsigned NumModifiedRegs = 0;

  if (FuncInfo->SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (FuncInfo->SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      if (--NumModifiedRegs == 0)
        break;
    } else if (CS.getReg() == BasePtrReg &&
               FuncInfo->SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs == 0)
        break;
    }
  }

  return false;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
  MachineFunction &MF,
  MachineBasicBlock &MBB,
  MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

    unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
    BuildMI(MBB, I, DL, TII->get(Op), SPReg)
      .addReg(SPReg)
      .addImm(Amount * ST.getWavefrontSize());
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry functions we can use an immediate offset in most cases, so the
  // presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
         MFI.hasStackMap() || MFI.hasPatchPoint() ||
         MF.getSubtarget<GCNSubtarget>()
             .getRegisterInfo()
             ->needsStackRealignment(MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}