1 //===----------------------- SIFrameLowering.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 
9 #include "SIFrameLowering.h"
10 #include "AMDGPUSubtarget.h"
11 #include "SIInstrInfo.h"
12 #include "SIMachineFunctionInfo.h"
13 #include "SIRegisterInfo.h"
14 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15 
16 #include "llvm/CodeGen/LivePhysRegs.h"
17 #include "llvm/CodeGen/MachineFrameInfo.h"
18 #include "llvm/CodeGen/MachineFunction.h"
19 #include "llvm/CodeGen/MachineInstrBuilder.h"
20 #include "llvm/CodeGen/RegisterScavenging.h"
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "frame-info"
25 
27 // Find a scratch register that we can use at the start of the prologue to
28 // re-align the stack pointer. We avoid using callee-save registers since they
29 // may appear to be free when this is called from canUseAsPrologue (during
30 // shrink wrapping), but then no longer be free when this is called from
31 // emitPrologue.
32 //
33 // FIXME: This is a bit conservative, since in the above case we could use one
34 // of the callee-save registers as a scratch temp to re-align the stack pointer,
35 // but we would then have to make sure that we were in fact saving at least one
36 // callee-save register in the prologue, which is additional complexity that
37 // doesn't seem worth the benefit.
38 static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
39                                                    LivePhysRegs &LiveRegs,
40                                                    const TargetRegisterClass &RC,
41                                                    bool Unused = false) {
42   // Mark callee saved registers as used so we will not choose them.
43   const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
44   for (unsigned i = 0; CSRegs[i]; ++i)
45     LiveRegs.addReg(CSRegs[i]);
46 
47   if (Unused) {
48     // We are looking for a register that can be used throughout the entire
49     // function, so any use is unacceptable.
50     for (MCRegister Reg : RC) {
51       if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
52         return Reg;
53     }
54   } else {
55     for (MCRegister Reg : RC) {
56       if (LiveRegs.available(MRI, Reg))
57         return Reg;
58     }
59   }
60 
  // If we require an unused register, we are in a context where failure is an
  // option and the caller has an alternative plan. In other contexts, this
  // must succeed.
64   if (!Unused)
65     report_fatal_error("failed to find free scratch register");
66 
67   return MCRegister();
68 }
69 
70 static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
71                                            LivePhysRegs &LiveRegs,
72                                            Register &TempSGPR,
73                                            Optional<int> &FrameIndex,
74                                            bool IsFP) {
75   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
76   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
77 
78 #ifndef NDEBUG
79   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
80   const SIRegisterInfo *TRI = ST.getRegisterInfo();
81 #endif
82 
83   // We need to save and restore the current FP/BP.
84 
85   // 1: If there is already a VGPR with free lanes, use it. We
86   // may already have to pay the penalty for spilling a CSR VGPR.
87   if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
88     int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
89                                             TargetStackID::SGPRSpill);
90 
91     if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
92       llvm_unreachable("allocate SGPR spill should have worked");
93 
94     FrameIndex = NewFI;
95 
96     LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
               dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
98                       << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
99                       << '\n');
100     return;
101   }
102 
103   // 2: Next, try to save the FP/BP in an unused SGPR.
104   TempSGPR = findScratchNonCalleeSaveRegister(
105       MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
106 
107   if (!TempSGPR) {
108     int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
109                                             TargetStackID::SGPRSpill);
110 
    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP/BP,
      // so we're forced to spill another VGPR to use for the spill.
      FrameIndex = NewFI;

      LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
                 dbgs() << (IsFP ? "FP" : "BP")
                        << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n';);
    } else {
      // 4: If all else fails, spill the FP/BP to memory.
      FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
      LLVM_DEBUG(dbgs() << "Reserved FI " << *FrameIndex << " for spilling "
                        << (IsFP ? "FP" : "BP") << '\n');
    }
124   } else {
125     LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
126                       << printReg(TempSGPR, TRI) << '\n');
127   }
128 }
129 
// We need to specially emit stack operations here because a different frame
// register is used than the one getFrameRegister would return for the rest
// of the function.
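//
// Illustrative output (a sketch; register numbers are hypothetical): with a
// small offset this lowers to something like
//   buffer_store_dword v0, off, s[0:3], s32 offset:16
// and, when the offset does not fit in the 12-bit immediate field, to
//   v_mov_b32 v1, 0x1234
//   buffer_store_dword v0, v1, s[0:3], s32 offen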
133 static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
134                              MachineBasicBlock::iterator I,
135                              const SIInstrInfo *TII, Register SpillReg,
136                              Register ScratchRsrcReg, Register SPReg, int FI) {
137   MachineFunction *MF = MBB.getParent();
138   MachineFrameInfo &MFI = MF->getFrameInfo();
139 
140   int64_t Offset = MFI.getObjectOffset(FI);
141 
142   MachineMemOperand *MMO = MF->getMachineMemOperand(
143       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
144       MFI.getObjectAlign(FI));
145 
146   if (isUInt<12>(Offset)) {
147     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
148       .addReg(SpillReg, RegState::Kill)
149       .addReg(ScratchRsrcReg)
150       .addReg(SPReg)
151       .addImm(Offset)
152       .addImm(0) // glc
153       .addImm(0) // slc
154       .addImm(0) // tfe
155       .addImm(0) // dlc
156       .addImm(0) // swz
157       .addMemOperand(MMO);
158     return;
159   }
160 
161   MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
162     MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
163 
164   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
165     .addImm(Offset);
166 
167   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
168     .addReg(SpillReg, RegState::Kill)
169     .addReg(OffsetReg, RegState::Kill)
170     .addReg(ScratchRsrcReg)
171     .addReg(SPReg)
172     .addImm(0)
173     .addImm(0) // glc
174     .addImm(0) // slc
175     .addImm(0) // tfe
176     .addImm(0) // dlc
177     .addImm(0) // swz
178     .addMemOperand(MMO);
179 }
180 
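// Counterpart of buildPrologSpill: reload a register from its stack slot,
// using the same frame register convention. Illustratively (hypothetical
// registers), the large-offset path lowers to something like:
//   v_mov_b32 v1, 0x1234
//   buffer_load_dword v0, v1, s[0:3], s32 offen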
181 static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
182                               MachineBasicBlock::iterator I,
183                               const SIInstrInfo *TII, Register SpillReg,
184                               Register ScratchRsrcReg, Register SPReg, int FI) {
185   MachineFunction *MF = MBB.getParent();
186   MachineFrameInfo &MFI = MF->getFrameInfo();
187   int64_t Offset = MFI.getObjectOffset(FI);
188 
189   MachineMemOperand *MMO = MF->getMachineMemOperand(
190       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
191       MFI.getObjectAlign(FI));
192 
193   if (isUInt<12>(Offset)) {
194     BuildMI(MBB, I, DebugLoc(),
195             TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
196       .addReg(ScratchRsrcReg)
197       .addReg(SPReg)
198       .addImm(Offset)
199       .addImm(0) // glc
200       .addImm(0) // slc
201       .addImm(0) // tfe
202       .addImm(0) // dlc
203       .addImm(0) // swz
204       .addMemOperand(MMO);
205     return;
206   }
207 
208   MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
209     MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
210 
211   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
212     .addImm(Offset);
213 
214   BuildMI(MBB, I, DebugLoc(),
215           TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
216     .addReg(OffsetReg, RegState::Kill)
217     .addReg(ScratchRsrcReg)
218     .addReg(SPReg)
219     .addImm(0)
220     .addImm(0) // glc
221     .addImm(0) // slc
222     .addImm(0) // tfe
223     .addImm(0) // dlc
224     .addImm(0) // swz
225     .addMemOperand(MMO);
226 }
227 
228 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
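//
// For the pre-GFX9 case where flat scratch is not a pointer, the emitted
// sequence is, illustratively (assuming the init pair arrives in s[4:5] and
// the wave offset in s6):
//   s_mov_b32  flat_scr_lo, s5      ; size in bytes
//   s_add_u32  s4, s4, s6           ; base += wave offset
//   s_lshr_b32 flat_scr_hi, s4, 8   ; offset in 256-byte units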
229 void SIFrameLowering::emitEntryFunctionFlatScratchInit(
230     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
231     const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
232   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
233   const SIInstrInfo *TII = ST.getInstrInfo();
234   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
235   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
236 
  // We don't need this if we only have spills, since there is no user-facing
  // scratch.
239 
240   // TODO: If we know we don't have flat instructions earlier, we can omit
241   // this from the input registers.
242   //
243   // TODO: We only need to know if we access scratch space through a flat
244   // pointer. Because we only detect if flat instructions are used at all,
245   // this will be used more often than necessary on VI.
246 
247   Register FlatScratchInitReg =
248       MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
249 
250   MachineRegisterInfo &MRI = MF.getRegInfo();
251   MRI.addLiveIn(FlatScratchInitReg);
252   MBB.addLiveIn(FlatScratchInitReg);
253 
254   Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
255   Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
256 
257   // Do a 64-bit pointer add.
258   if (ST.flatScratchIsPointer()) {
259     if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
260       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
261         .addReg(FlatScrInitLo)
262         .addReg(ScratchWaveOffsetReg);
263       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
264         .addReg(FlatScrInitHi)
265         .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
        .addReg(FlatScrInitLo)
        .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
        .addReg(FlatScrInitHi)
        .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
274       return;
275     }
276 
277     // For GFX9.
278     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
279       .addReg(FlatScrInitLo)
280       .addReg(ScratchWaveOffsetReg);
281     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
282       .addReg(FlatScrInitHi)
283       .addImm(0);
284 
285     return;
286   }
287 
288   assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
289 
290   // Copy the size in bytes.
291   BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
292     .addReg(FlatScrInitHi, RegState::Kill);
293 
294   // Add wave offset in bytes to private base offset.
295   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
296   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
297     .addReg(FlatScrInitLo)
298     .addReg(ScratchWaveOffsetReg);
299 
300   // Convert offset to 256-byte units.
301   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
302     .addReg(FlatScrInitLo, RegState::Kill)
303     .addImm(8);
304 }
305 
306 // Shift down registers reserved for the scratch RSRC.
307 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
308     MachineFunction &MF) const {
309 
310   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
311   const SIInstrInfo *TII = ST.getInstrInfo();
312   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
313   MachineRegisterInfo &MRI = MF.getRegInfo();
314   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
315 
316   assert(MFI->isEntryFunction());
317 
318   Register ScratchRsrcReg = MFI->getScratchRSrcReg();
319 
320   if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
321     return Register();
322 
323   if (ST.hasSGPRInitBug() ||
324       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
325     return ScratchRsrcReg;
326 
327   // We reserved the last registers for this. Shift it down to the end of those
328   // which were actually used.
329   //
330   // FIXME: It might be safer to use a pseudoregister before replacement.
331 
332   // FIXME: We should be able to eliminate unused input registers. We only
333   // cannot do this for the resources required for scratch access. For now we
334   // skip over user SGPRs and may leave unused holes.
335 
336   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
337   ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
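  // For example (illustrative): with 10 preloaded SGPRs, NumPreloaded is
  // (10 + 3) / 4 = 3, so the search below skips the first three SGPR128
  // tuples (s[0:3], s[4:7], s[8:11]) and starts at s[12:15].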
339 
340   // Skip the last N reserved elements because they should have already been
341   // reserved for VCC etc.
342   Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
343   for (MCPhysReg Reg : AllSGPR128s) {
344     // Pick the first unallocated one. Make sure we don't clobber the other
345     // reserved input we needed. Also for PAL, make sure we don't clobber
346     // the GIT pointer passed in SGPR0 or SGPR8.
347     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
348         !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
349       MRI.replaceRegWith(ScratchRsrcReg, Reg);
350       MFI->setScratchRSrcReg(Reg);
351       return Reg;
352     }
353   }
354 
355   return ScratchRsrcReg;
356 }
357 
358 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
359                                                 MachineBasicBlock &MBB) const {
360   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
361 
362   // FIXME: If we only have SGPR spills, we won't actually be using scratch
363   // memory since these spill to VGPRs. We should be cleaning up these unused
364   // SGPR spill frame indices somewhere.
365 
366   // FIXME: We still have implicit uses on SGPR spill instructions in case they
367   // need to spill to vector memory. It's likely that will not happen, but at
368   // this point it appears we need the setup. This part of the prolog should be
369   // emitted after frame indices are eliminated.
370 
371   // FIXME: Remove all of the isPhysRegUsed checks
372 
373   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
374   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
375   const SIInstrInfo *TII = ST.getInstrInfo();
376   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
377   MachineRegisterInfo &MRI = MF.getRegInfo();
378   const Function &F = MF.getFunction();
379 
380   assert(MFI->isEntryFunction());
381 
382   Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
383       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
384   // FIXME: Hack to not crash in situations which emitted an error.
385   if (!PreloadedScratchWaveOffsetReg)
386     return;
387 
388   // We need to do the replacement of the private segment buffer register even
389   // if there are no stack objects. There could be stores to undef or a
390   // constant without an associated object.
391   //
392   // This will return `Register()` in cases where there are no actual
393   // uses of the SRSRC.
394   Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
395 
396   // Make the selected register live throughout the function.
397   if (ScratchRsrcReg) {
398     for (MachineBasicBlock &OtherBB : MF) {
399       if (&OtherBB != &MBB) {
400         OtherBB.addLiveIn(ScratchRsrcReg);
401       }
402     }
403   }
404 
405   // Now that we have fixed the reserved SRSRC we need to locate the
406   // (potentially) preloaded SRSRC.
407   Register PreloadedScratchRsrcReg;
408   if (ST.isAmdHsaOrMesa(F)) {
409     PreloadedScratchRsrcReg =
410         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
411     if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
412       // We added live-ins during argument lowering, but since they were not
413       // used they were deleted. We're adding the uses now, so add them back.
414       MRI.addLiveIn(PreloadedScratchRsrcReg);
415       MBB.addLiveIn(PreloadedScratchRsrcReg);
416     }
417   }
418 
419   // Debug location must be unknown since the first debug location is used to
420   // determine the end of the prologue.
421   DebugLoc DL;
422   MachineBasicBlock::iterator I = MBB.begin();
423 
  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
429   Register ScratchWaveOffsetReg;
430   if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
431     ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
432     unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
433     AllSGPRs = AllSGPRs.slice(
434         std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
435     Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
436     for (MCPhysReg Reg : AllSGPRs) {
437       if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
438           !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
439         ScratchWaveOffsetReg = Reg;
440         BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
441             .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
442         break;
443       }
444     }
445   } else {
446     ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
447   }
448   assert(ScratchWaveOffsetReg);
449 
450   if (requiresStackPointerReference(MF)) {
451     Register SPReg = MFI->getStackPtrOffsetReg();
452     assert(SPReg != AMDGPU::SP_REG);
453     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
454         .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
455   }
456 
457   if (hasFP(MF)) {
458     Register FPReg = MFI->getFrameOffsetReg();
459     assert(FPReg != AMDGPU::FP_REG);
460     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
461   }
462 
463   if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
464     MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
465     MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
466   }
467 
468   if (MFI->hasFlatScratchInit()) {
469     emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
470   }
471 
472   if (ScratchRsrcReg) {
473     emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
474                                          PreloadedScratchRsrcReg,
475                                          ScratchRsrcReg, ScratchWaveOffsetReg);
476   }
477 }
478 
479 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
480 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
481     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
482     const DebugLoc &DL, Register PreloadedScratchRsrcReg,
483     Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
484 
485   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
486   const SIInstrInfo *TII = ST.getInstrInfo();
487   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
488   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
489   const Function &Fn = MF.getFunction();
490 
491   if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
494     Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
495     Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
496     Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
497 
498     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
499 
500     if (MFI->getGITPtrHigh() != 0xffffffff) {
501       BuildMI(MBB, I, DL, SMovB32, RsrcHi)
502         .addImm(MFI->getGITPtrHigh())
503         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
504     } else {
505       const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
506       BuildMI(MBB, I, DL, GetPC64, Rsrc01);
507     }
508     Register GitPtrLo = MFI->getGITPtrLoReg(MF);
509     MF.getRegInfo().addLiveIn(GitPtrLo);
510     MBB.addLiveIn(GitPtrLo);
511     BuildMI(MBB, I, DL, SMovB32, RsrcLo)
512       .addReg(GitPtrLo)
513       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
514 
515     // We now have the GIT ptr - now get the scratch descriptor from the entry
516     // at offset 0 (or offset 16 for a compute shader).
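    // Illustratively (hypothetical registers), the resulting sequence is:
    //   s_getpc_b64 s[0:1]              ; or s_mov_b32 s1, <git-ptr-high>
    //   s_mov_b32 s0, s8                ; low half of the GIT pointer
    //   s_load_dwordx4 s[0:3], s[0:1], 0x0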
517     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
518     const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
519     auto MMO = MF.getMachineMemOperand(PtrInfo,
520                                        MachineMemOperand::MOLoad |
521                                            MachineMemOperand::MOInvariant |
522                                            MachineMemOperand::MODereferenceable,
523                                        16, Align(4));
524     unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST, Offset);
527     BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
528       .addReg(Rsrc01)
529       .addImm(EncodedOffset) // offset
530       .addImm(0) // glc
531       .addImm(0) // dlc
532       .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
533       .addMemOperand(MMO);
534   } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
535     assert(!ST.isAmdHsaOrMesa(Fn));
536     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
537 
538     Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
539     Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
540 
    // Use relocations to get the pointer, and set up the other bits manually.
542     uint64_t Rsrc23 = TII->getScratchRsrcWords23();
543 
544     if (MFI->hasImplicitBufferPtr()) {
545       Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
546 
547       if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
548         const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
549 
550         BuildMI(MBB, I, DL, Mov64, Rsrc01)
551           .addReg(MFI->getImplicitBufferPtrUserSGPR())
552           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
553       } else {
554         const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
555 
556         MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
557         auto MMO = MF.getMachineMemOperand(
558             PtrInfo,
559             MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
560                 MachineMemOperand::MODereferenceable,
561             8, Align(4));
562         BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
563           .addReg(MFI->getImplicitBufferPtrUserSGPR())
564           .addImm(0) // offset
565           .addImm(0) // glc
566           .addImm(0) // dlc
567           .addMemOperand(MMO)
568           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
569 
570         MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
571         MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
572       }
573     } else {
574       Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
575       Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
576 
577       BuildMI(MBB, I, DL, SMovB32, Rsrc0)
578         .addExternalSymbol("SCRATCH_RSRC_DWORD0")
579         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
580 
581       BuildMI(MBB, I, DL, SMovB32, Rsrc1)
582         .addExternalSymbol("SCRATCH_RSRC_DWORD1")
583         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
585     }
586 
587     BuildMI(MBB, I, DL, SMovB32, Rsrc2)
588       .addImm(Rsrc23 & 0xffffffff)
589       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
590 
591     BuildMI(MBB, I, DL, SMovB32, Rsrc3)
592       .addImm(Rsrc23 >> 32)
593       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
594   } else if (ST.isAmdHsaOrMesa(Fn)) {
595     assert(PreloadedScratchRsrcReg);
596 
597     if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
598       BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
599           .addReg(PreloadedScratchRsrcReg, RegState::Kill);
600     }
601   }
602 
603   // Add the scratch wave offset into the scratch RSRC.
604   //
605   // We only want to update the first 48 bits, which is the base address
606   // pointer, without touching the adjacent 16 bits of flags. We know this add
607   // cannot carry-out from bit 47, otherwise the scratch allocation would be
608   // impossible to fit in the 48-bit global address space.
609   //
610   // TODO: Evaluate if it is better to just construct an SRD using the flat
611   // scratch init and some constants rather than update the one we are passed.
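  // Illustratively (hypothetical registers, SRSRC in s[0:3], wave offset in
  // s6), the two instructions below lower to:
  //   s_add_u32  s0, s0, s6   ; add the offset into the low half of the base
  //   s_addc_u32 s1, s1, 0    ; propagate the carry into bits 32..47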
612   Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
613   Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
614 
615   // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
616   // the kernel body via inreg arguments.
617   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
618       .addReg(ScratchRsrcSub0)
619       .addReg(ScratchWaveOffsetReg)
620       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
621   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
622       .addReg(ScratchRsrcSub1)
623       .addImm(0)
624       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
625 }
626 
627 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
628   switch (ID) {
629   case TargetStackID::Default:
630   case TargetStackID::NoAlloc:
631   case TargetStackID::SGPRSpill:
632     return true;
633   case TargetStackID::SVEVector:
634     return false;
635   }
636   llvm_unreachable("Invalid TargetStackID::Value");
637 }
638 
// Activate all lanes and return the saved exec mask.
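// Illustratively, in wave64 this emits
//   s_or_saveexec_b64 s[N:N+1], -1
// into a scavenged SGPR pair (hypothetical registers); wave32 uses the 32-bit
// variant on a single SGPR.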
640 static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
641                                      MachineFunction &MF,
642                                      MachineBasicBlock &MBB,
643                                      MachineBasicBlock::iterator MBBI,
644                                      bool IsProlog) {
645   Register ScratchExecCopy;
646   MachineRegisterInfo &MRI = MF.getRegInfo();
647   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
648   const SIInstrInfo *TII = ST.getInstrInfo();
649   const SIRegisterInfo &TRI = TII->getRegisterInfo();
650   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
651   DebugLoc DL;
652 
653   if (LiveRegs.empty()) {
654     if (IsProlog) {
655       LiveRegs.init(TRI);
656       LiveRegs.addLiveIns(MBB);
657       if (FuncInfo->SGPRForFPSaveRestoreCopy)
658         LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
659 
660       if (FuncInfo->SGPRForBPSaveRestoreCopy)
661         LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
662     } else {
663       // In epilog.
664       LiveRegs.init(*ST.getRegisterInfo());
665       LiveRegs.addLiveOuts(MBB);
666       LiveRegs.stepBackward(*MBBI);
667     }
668   }
669 
670   ScratchExecCopy = findScratchNonCalleeSaveRegister(
671       MRI, LiveRegs, *TRI.getWaveMaskRegClass());
672 
673   if (!IsProlog)
674     LiveRegs.removeReg(ScratchExecCopy);
675 
676   const unsigned OrSaveExec =
677       ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
678   BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);
679 
680   return ScratchExecCopy;
681 }
682 
683 void SIFrameLowering::emitPrologue(MachineFunction &MF,
684                                    MachineBasicBlock &MBB) const {
685   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
686   if (FuncInfo->isEntryFunction()) {
687     emitEntryFunctionPrologue(MF, MBB);
688     return;
689   }
690 
691   const MachineFrameInfo &MFI = MF.getFrameInfo();
692   MachineRegisterInfo &MRI = MF.getRegInfo();
693   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
694   const SIInstrInfo *TII = ST.getInstrInfo();
695   const SIRegisterInfo &TRI = TII->getRegisterInfo();
696 
697   Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
698   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
699   Register BasePtrReg =
700       TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
701   LivePhysRegs LiveRegs;
702 
703   MachineBasicBlock::iterator MBBI = MBB.begin();
704   DebugLoc DL;
705 
706   bool HasFP = false;
707   bool HasBP = false;
708   uint32_t NumBytes = MFI.getStackSize();
709   uint32_t RoundedSize = NumBytes;
710   // To avoid clobbering VGPRs in lanes that weren't active on function entry,
711   // turn on all lanes before doing the spill to memory.
712   Register ScratchExecCopy;
713 
714   bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
715   bool SpillFPToMemory = false;
716   // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
717   // Otherwise we are spilling the FP to memory.
718   if (HasFPSaveIndex) {
719     SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
720                       TargetStackID::SGPRSpill;
721   }
722 
723   bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
724   bool SpillBPToMemory = false;
725   // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
726   // Otherwise we are spilling the BP to memory.
727   if (HasBPSaveIndex) {
728     SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
729                       TargetStackID::SGPRSpill;
730   }
731 
732   // Emit the copy if we need an FP, and are using a free SGPR to save it.
733   if (FuncInfo->SGPRForFPSaveRestoreCopy) {
734     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
735       .addReg(FramePtrReg)
736       .setMIFlag(MachineInstr::FrameSetup);
737   }
738 
739   // Emit the copy if we need a BP, and are using a free SGPR to save it.
740   if (FuncInfo->SGPRForBPSaveRestoreCopy) {
741     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
742             FuncInfo->SGPRForBPSaveRestoreCopy)
743         .addReg(BasePtrReg)
744         .setMIFlag(MachineInstr::FrameSetup);
745   }
746 
  // If a copy has been emitted for FP and/or BP, make the SGPRs used in the
  // copy instructions live throughout the function.
749   SmallVector<MCPhysReg, 2> TempSGPRs;
750   if (FuncInfo->SGPRForFPSaveRestoreCopy)
751     TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);
752 
753   if (FuncInfo->SGPRForBPSaveRestoreCopy)
754     TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);
755 
756   if (!TempSGPRs.empty()) {
757     for (MachineBasicBlock &MBB : MF) {
758       for (MCPhysReg Reg : TempSGPRs)
759         MBB.addLiveIn(Reg);
760 
761       MBB.sortUniqueLiveIns();
762     }
763   }
764 
765   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
766          : FuncInfo->getSGPRSpillVGPRs()) {
767     if (!Reg.FI.hasValue())
768       continue;
769 
770     if (!ScratchExecCopy)
771       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
772 
773     buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
774                      FuncInfo->getScratchRSrcReg(),
775                      StackPtrReg,
776                      Reg.FI.getValue());
777   }
778 
779   if (HasFPSaveIndex && SpillFPToMemory) {
780     assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));
781 
782     if (!ScratchExecCopy)
783       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
784 
785     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
786         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
787 
788     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
789         .addReg(FramePtrReg);
790 
791     buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
792                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
793                      FuncInfo->FramePointerSaveIndex.getValue());
794   }
795 
796   if (HasBPSaveIndex && SpillBPToMemory) {
797     assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));
798 
799     if (!ScratchExecCopy)
800       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
801 
802     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
803         MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
804 
805     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
806         .addReg(BasePtrReg);
807 
808     buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
809                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
810                      *FuncInfo->BasePointerSaveIndex);
811   }
812 
813   if (ScratchExecCopy) {
814     // FIXME: Split block and make terminator.
815     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
816     MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
817     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
818         .addReg(ScratchExecCopy, RegState::Kill);
819     LiveRegs.addReg(ScratchExecCopy);
820   }
821 
822   // In this case, spill the FP to a reserved VGPR.
823   if (HasFPSaveIndex && !SpillFPToMemory) {
824     const int FI = FuncInfo->FramePointerSaveIndex.getValue();
825     assert(!MFI.isDeadObjectIndex(FI));
826 
827     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
828     ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
829         FuncInfo->getSGPRToVGPRSpills(FI);
830     assert(Spill.size() == 1);
831 
832     // Save FP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
834     BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
835             Spill[0].VGPR)
836         .addReg(FramePtrReg)
837         .addImm(Spill[0].Lane)
838         .addReg(Spill[0].VGPR, RegState::Undef);
839   }
840 
841   // In this case, spill the BP to a reserved VGPR.
842   if (HasBPSaveIndex && !SpillBPToMemory) {
843     const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
844     assert(!MFI.isDeadObjectIndex(BasePtrFI));
845 
846     assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
847     ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
848         FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
849     assert(Spill.size() == 1);
850 
851     // Save BP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
853     BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
854             Spill[0].VGPR)
855         .addReg(BasePtrReg)
856         .addImm(Spill[0].Lane)
857         .addReg(Spill[0].VGPR, RegState::Undef);
858   }
859 
860   if (TRI.needsStackRealignment(MF)) {
861     HasFP = true;
862     const unsigned Alignment = MFI.getMaxAlign().value();
863 
864     RoundedSize += Alignment;
865     if (LiveRegs.empty()) {
866       LiveRegs.init(TRI);
867       LiveRegs.addLiveIns(MBB);
868       LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
869       LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
870     }
871 
872     Register ScratchSPReg = findScratchNonCalleeSaveRegister(
873         MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
874     assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
875            ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);
876 
    // s_add_u32 tmp_reg, s32, (MaxAlign - 1) * WaveSize
    // s_and_b32 fp, tmp_reg, 0b111...0000
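    // For example (illustrative), with MaxAlign = 16 bytes and a wave size of
    // 64, this adds (16 - 1) * 64 = 960 and then masks with -16 * 64 =
    // 0xFFFFFC00, rounding the per-wave offset up to a 1024-byte boundary,
    // i.e. 16 bytes per lane.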
879     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
880         .addReg(StackPtrReg)
881         .addImm((Alignment - 1) * ST.getWavefrontSize())
882         .setMIFlag(MachineInstr::FrameSetup);
883     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
884         .addReg(ScratchSPReg, RegState::Kill)
885         .addImm(-Alignment * ST.getWavefrontSize())
886         .setMIFlag(MachineInstr::FrameSetup);
887     FuncInfo->setIsStackRealigned(true);
888   } else if ((HasFP = hasFP(MF))) {
889     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
890         .addReg(StackPtrReg)
891         .setMIFlag(MachineInstr::FrameSetup);
892   }
893 
894   // If we need a base pointer, set it up here. It's whatever the value of
895   // the stack pointer is at this point. Any variable size objects will be
896   // allocated after this, so we can still use the base pointer to reference
897   // the incoming arguments.
898   if ((HasBP = TRI.hasBasePointer(MF))) {
899     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
900         .addReg(StackPtrReg)
901         .setMIFlag(MachineInstr::FrameSetup);
902   }
903 
904   if (HasFP && RoundedSize != 0) {
905     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
906         .addReg(StackPtrReg)
907         .addImm(RoundedSize * ST.getWavefrontSize())
908         .setMIFlag(MachineInstr::FrameSetup);
909   }
910 
911   assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
912                      FuncInfo->FramePointerSaveIndex)) &&
913          "Needed to save FP but didn't save it anywhere");
914 
915   assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
916                     !FuncInfo->FramePointerSaveIndex)) &&
917          "Saved FP but didn't need it");
918 
919   assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
920                      FuncInfo->BasePointerSaveIndex)) &&
921          "Needed to save BP but didn't save it anywhere");
922 
923   assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
924                     !FuncInfo->BasePointerSaveIndex)) &&
925          "Saved BP but didn't need it");
926 }
927 
928 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
929                                    MachineBasicBlock &MBB) const {
930   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
931   if (FuncInfo->isEntryFunction())
932     return;
933 
934   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
935   const SIInstrInfo *TII = ST.getInstrInfo();
936   MachineRegisterInfo &MRI = MF.getRegInfo();
937   const SIRegisterInfo &TRI = TII->getRegisterInfo();
938   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
939   LivePhysRegs LiveRegs;
940   DebugLoc DL;
941 
942   const MachineFrameInfo &MFI = MF.getFrameInfo();
943   uint32_t NumBytes = MFI.getStackSize();
944   uint32_t RoundedSize = FuncInfo->isStackRealigned()
945                              ? NumBytes + MFI.getMaxAlign().value()
946                              : NumBytes;
947   const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
948   const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
949   const Register BasePtrReg =
950       TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
951 
952   bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
953   bool SpillFPToMemory = false;
954   if (HasFPSaveIndex) {
955     SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
956                       TargetStackID::SGPRSpill;
957   }
958 
959   bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
960   bool SpillBPToMemory = false;
961   if (HasBPSaveIndex) {
962     SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
963                       TargetStackID::SGPRSpill;
964   }
965 
966   if (RoundedSize != 0 && hasFP(MF)) {
967     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
968       .addReg(StackPtrReg)
969       .addImm(RoundedSize * ST.getWavefrontSize())
970       .setMIFlag(MachineInstr::FrameDestroy);
971   }
972 
973   if (FuncInfo->SGPRForFPSaveRestoreCopy) {
974     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
975         .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
977   }
978 
979   if (FuncInfo->SGPRForBPSaveRestoreCopy) {
980     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
981         .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
983   }
984 
985   Register ScratchExecCopy;
986   if (HasFPSaveIndex) {
987     const int FI = FuncInfo->FramePointerSaveIndex.getValue();
988     assert(!MFI.isDeadObjectIndex(FI));
989     if (SpillFPToMemory) {
990       if (!ScratchExecCopy)
991         ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
992 
993       MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
994           MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
995       buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
996                         FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
997       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
998           .addReg(TempVGPR, RegState::Kill);
999     } else {
1000       // Reload from VGPR spill.
1001       assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
1002       ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
1003           FuncInfo->getSGPRToVGPRSpills(FI);
1004       assert(Spill.size() == 1);
1005       BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
1006               FramePtrReg)
1007           .addReg(Spill[0].VGPR)
1008           .addImm(Spill[0].Lane);
1009     }
1010   }
1011 
1012   if (HasBPSaveIndex) {
1013     const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
1014     assert(!MFI.isDeadObjectIndex(BasePtrFI));
1015     if (SpillBPToMemory) {
1016       if (!ScratchExecCopy)
1017         ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
1018 
1019       MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
1020           MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
1021       buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
1022                         FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
1023       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
1024           .addReg(TempVGPR, RegState::Kill);
1025     } else {
1026       // Reload from VGPR spill.
1027       assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
1028       ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
1029           FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
1030       assert(Spill.size() == 1);
1031       BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
1032               BasePtrReg)
1033           .addReg(Spill[0].VGPR)
1034           .addImm(Spill[0].Lane);
1035     }
1036   }
1037 
1038   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
1039        FuncInfo->getSGPRSpillVGPRs()) {
1040     if (!Reg.FI.hasValue())
1041       continue;
1042 
1043     if (!ScratchExecCopy)
1044       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
1045 
1046     buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
1047                       FuncInfo->getScratchRSrcReg(), StackPtrReg,
1048                       Reg.FI.getValue());
1049   }
1050 
1051   if (ScratchExecCopy) {
1052     // FIXME: Split block and make terminator.
1053     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1054     MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1055     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
1056         .addReg(ScratchExecCopy, RegState::Kill);
1057   }
1058 }
1059 
1060 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
1061 // memory. They should have been removed by now.
1062 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
1063   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1064        I != E; ++I) {
1065     if (!MFI.isDeadObjectIndex(I))
1066       return false;
1067   }
1068 
1069   return true;
1070 }
1071 
1072 #ifndef NDEBUG
1073 static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
1074                                  Optional<int> FramePointerSaveIndex,
1075                                  Optional<int> BasePointerSaveIndex) {
1076   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1077        I != E; ++I) {
1078     if (!MFI.isDeadObjectIndex(I) &&
1079         MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1080         ((FramePointerSaveIndex && I != FramePointerSaveIndex) ||
1081          (BasePointerSaveIndex && I != BasePointerSaveIndex))) {
1082       return false;
1083     }
1084   }
1085 
1086   return true;
1087 }
1088 #endif
1089 
1090 int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
1091                                             Register &FrameReg) const {
1092   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1093 
1094   FrameReg = RI->getFrameRegister(MF);
1095   return MF.getFrameInfo().getObjectOffset(FI);
1096 }
1097 
1098 void SIFrameLowering::processFunctionBeforeFrameFinalized(
1099   MachineFunction &MF,
1100   RegScavenger *RS) const {
1101   MachineFrameInfo &MFI = MF.getFrameInfo();
1102 
1103   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1104   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1105   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1106 
1107   FuncInfo->removeDeadFrameIndices(MFI);
1108   assert(allSGPRSpillsAreDead(MFI, None, None) &&
1109          "SGPR spill should have been removed in SILowerSGPRSpills");
1110 
1111   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1112   // but currently hasNonSpillStackObjects is set only from source
1113   // allocas. Stack temps produced from legalization are not counted currently.
1114   if (!allStackObjectsAreDead(MFI)) {
1115     assert(RS && "RegScavenger required if spilling");
1116 
1117     if (FuncInfo->isEntryFunction()) {
1118       int ScavengeFI = MFI.CreateFixedObject(
1119         TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
1120       RS->addScavengingFrameIndex(ScavengeFI);
1121     } else {
1122       int ScavengeFI = MFI.CreateStackObject(
1123           TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
1124           TRI->getSpillAlign(AMDGPU::SGPR_32RegClass), false);
1125       RS->addScavengingFrameIndex(ScavengeFI);
1126     }
1127   }
1128 }
1129 
1130 // Only report VGPRs to generic code.
1131 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1132                                            BitVector &SavedVGPRs,
1133                                            RegScavenger *RS) const {
1134   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
1135   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1136   if (MFI->isEntryFunction())
1137     return;
1138 
1139   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1140   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1141   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1142 
1143   // Ignore the SGPRs the default implementation found.
1144   SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
1145 
1146   // hasFP only knows about stack objects that already exist. We're now
1147   // determining the stack slots that will be created, so we have to predict
1148   // them. Stack objects force FP usage with calls.
1149   //
1150   // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1151   // don't want to report it here.
1152   //
1153   // FIXME: Is this really hasReservedCallFrame?
1154   const bool WillHaveFP =
1155       FrameInfo.hasCalls() &&
1156       (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1157 
1158   // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
1159   // so don't allow the default insertion to handle them.
1160   for (auto SSpill : MFI->getSGPRSpillVGPRs())
1161     SavedVGPRs.reset(SSpill.VGPR);
1162 
1163   LivePhysRegs LiveRegs;
1164   LiveRegs.init(*TRI);
1165 
1166   if (WillHaveFP || hasFP(MF)) {
1167     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
1168                                    MFI->FramePointerSaveIndex, true);
1169   }
1170 
1171   if (TRI->hasBasePointer(MF)) {
1172     if (MFI->SGPRForFPSaveRestoreCopy)
1173       LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
1174     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
1175                                    MFI->BasePointerSaveIndex, false);
1176   }
1177 }
1178 
1179 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1180                                                BitVector &SavedRegs,
1181                                                RegScavenger *RS) const {
1182   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1183   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1184   if (MFI->isEntryFunction())
1185     return;
1186 
1187   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1188   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1189 
1190   // The SP is specifically managed and we don't want extra spills of it.
1191   SavedRegs.reset(MFI->getStackPtrOffsetReg());
1192   SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
1193 }
1194 
1195 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1196     MachineFunction &MF, const TargetRegisterInfo *TRI,
1197     std::vector<CalleeSavedInfo> &CSI) const {
1198   if (CSI.empty())
1199     return true; // Early exit if no callee saved registers are modified!
1200 
1201   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1202   if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
1203       !FuncInfo->SGPRForBPSaveRestoreCopy)
1204     return false;
1205 
1206   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1207   const SIRegisterInfo *RI = ST.getRegisterInfo();
1208   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1209   Register BasePtrReg = RI->getBaseRegister();
1210   unsigned NumModifiedRegs = 0;
1211 
1212   if (FuncInfo->SGPRForFPSaveRestoreCopy)
1213     NumModifiedRegs++;
1214   if (FuncInfo->SGPRForBPSaveRestoreCopy)
1215     NumModifiedRegs++;
1216 
1217   for (auto &CS : CSI) {
1218     if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
1219       CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      if (!--NumModifiedRegs)
        break;
1222     } else if (CS.getReg() == BasePtrReg &&
1223                FuncInfo->SGPRForBPSaveRestoreCopy) {
1224       CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
      if (!--NumModifiedRegs)
        break;
1227     }
1228   }
1229 
1230   return false;
1231 }
1232 
1233 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1234   MachineFunction &MF,
1235   MachineBasicBlock &MBB,
1236   MachineBasicBlock::iterator I) const {
1237   int64_t Amount = I->getOperand(0).getImm();
1238   if (Amount == 0)
1239     return MBB.erase(I);
1240 
1241   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1242   const SIInstrInfo *TII = ST.getInstrInfo();
1243   const DebugLoc &DL = I->getDebugLoc();
1244   unsigned Opc = I->getOpcode();
1245   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1246   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1247 
1248   if (!hasReservedCallFrame(MF)) {
1249     Amount = alignTo(Amount, getStackAlign());
1250     assert(isUInt<32>(Amount) && "exceeded stack address space size");
1251     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1252     Register SPReg = MFI->getStackPtrOffsetReg();
1253 
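    // The SP holds a per-wave byte offset into scratch, so the per-lane
    // Amount is scaled by the wave size; e.g. (illustrative) a 16-byte
    // per-lane adjustment moves SP by 16 * 64 = 1024 bytes in wave64.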
1254     unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
1255     BuildMI(MBB, I, DL, TII->get(Op), SPReg)
1256       .addReg(SPReg)
1257       .addImm(Amount * ST.getWavefrontSize());
1258   } else if (CalleePopAmount != 0) {
1259     llvm_unreachable("is this used?");
1260   }
1261 
1262   return MBB.erase(I);
1263 }
1264 
1265 /// Returns true if the frame will require a reference to the stack pointer.
1266 ///
1267 /// This is the set of conditions common to setting up the stack pointer in a
1268 /// kernel, and for using a frame pointer in a callable function.
1269 ///
1270 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1271 /// references SP.
1272 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1273   return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1274 }
1275 
// The FP for kernels is always known 0, so we never really need to set up an
// explicit register for it. However, DisableFramePointerElim will force us to
// use a register for it.
1279 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1280   const MachineFrameInfo &MFI = MF.getFrameInfo();
1281 
1282   // For entry functions we can use an immediate offset in most cases, so the
1283   // presence of calls doesn't imply we need a distinct frame pointer.
1284   if (MFI.hasCalls() &&
1285       !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1286     // All offsets are unsigned, so need to be addressed in the same direction
1287     // as stack growth.
1288 
1289     // FIXME: This function is pretty broken, since it can be called before the
1290     // frame layout is determined or CSR spills are inserted.
1291     return MFI.getStackSize() != 0;
1292   }
1293 
1294   return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1295     MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
1296     MF.getTarget().Options.DisableFramePointerElim(MF);
1297 }
1298 
1299 // This is essentially a reduced version of hasFP for entry functions. Since the
1300 // stack pointer is known 0 on entry to kernels, we never really need an FP
1301 // register. We may need to initialize the stack pointer depending on the frame
1302 // properties, which logically overlaps many of the cases where an ordinary
1303 // function would require an FP.
1304 bool SIFrameLowering::requiresStackPointerReference(
1305     const MachineFunction &MF) const {
1306   // Callable functions always require a stack pointer reference.
1307   assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1308          "only expected to call this for entry points");
1309 
1310   const MachineFrameInfo &MFI = MF.getFrameInfo();
1311 
1312   // Entry points ordinarily don't need to initialize SP. We have to set it up
1313   // for callees if there are any. Also note tail calls are impossible/don't
1314   // make any sense for kernels.
1315   if (MFI.hasCalls())
1316     return true;
1317 
1318   // We still need to initialize the SP if we're doing anything weird that
1319   // references the SP, like variable sized stack objects.
1320   return frameTriviallyRequiresSP(MFI);
1321 }
1322