1 //===----------------------- SIFrameLowering.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8
9 #include "SIFrameLowering.h"
10 #include "AMDGPU.h"
11 #include "GCNSubtarget.h"
12 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13 #include "SIMachineFunctionInfo.h"
14 #include "llvm/CodeGen/LivePhysRegs.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/RegisterScavenging.h"
17 #include "llvm/Target/TargetMachine.h"
18
19 using namespace llvm;
20
21 #define DEBUG_TYPE "frame-info"
22
23 static cl::opt<bool> EnableSpillVGPRToAGPR(
24 "amdgpu-spill-vgpr-to-agpr",
25 cl::desc("Enable spilling VGPRs to AGPRs"),
26 cl::ReallyHidden,
27 cl::init(true));
28
29 // Find a scratch register that we can use in the prologue. We avoid using
30 // callee-save registers since they may appear to be free when this is called
31 // from canUseAsPrologue (during shrink wrapping), but then no longer be free
32 // when this is called from emitPrologue.
33 static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
34 LivePhysRegs &LiveRegs,
35 const TargetRegisterClass &RC,
36 bool Unused = false) {
37 // Mark callee saved registers as used so we will not choose them.
38 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
39 for (unsigned i = 0; CSRegs[i]; ++i)
40 LiveRegs.addReg(CSRegs[i]);
41
42 if (Unused) {
43 // We are looking for a register that can be used throughout the entire
44 // function, so any use is unacceptable.
45 for (MCRegister Reg : RC) {
46 if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
47 return Reg;
48 }
49 } else {
50 for (MCRegister Reg : RC) {
51 if (LiveRegs.available(MRI, Reg))
52 return Reg;
53 }
54 }
55
56 return MCRegister();
57 }
58
59 static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
60 LivePhysRegs &LiveRegs,
61 Register &TempSGPR,
62 Optional<int> &FrameIndex,
63 bool IsFP) {
64 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
65 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
66
67 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
68 const SIRegisterInfo *TRI = ST.getRegisterInfo();
69
70 // We need to save and restore the current FP/BP.
71
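  // In short, the save strategy below tries, in order: (1) an already-free lane
  // in a VGPR used for SGPR spills, (2) a copy into an unused non-CSR SGPR,
  // (3) a lane in a newly spilled VGPR, and (4) a plain spill to scratch memory.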
72 // 1: If there is already a VGPR with free lanes, use it. We
73 // may already have to pay the penalty for spilling a CSR VGPR.
74 if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
75 int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
76 TargetStackID::SGPRSpill);
77
78 if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
79 llvm_unreachable("allocate SGPR spill should have worked");
80
81 FrameIndex = NewFI;
82
83 LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
84 dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
85 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
86 << '\n');
87 return;
88 }
89
90 // 2: Next, try to save the FP/BP in an unused SGPR.
91 TempSGPR = findScratchNonCalleeSaveRegister(
92 MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
93
94 if (!TempSGPR) {
95 int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
96 TargetStackID::SGPRSpill);
97
98 if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
99 // 3: There's no free lane to spill, and no free register to save FP/BP,
100 // so we're forced to spill another VGPR to use for the spill.
101 FrameIndex = NewFI;
102
103 LLVM_DEBUG(
104 auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
105 dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
106 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
107 } else {
108 // Remove dead <NewFI> index
109 MF.getFrameInfo().RemoveStackObject(NewFI);
110 // 4: If all else fails, spill the FP/BP to memory.
111 FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
112 LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling "
113 << (IsFP ? "FP" : "BP") << '\n');
114 }
115 } else {
116 LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
117 << printReg(TempSGPR, TRI) << '\n');
118 }
119 }
120
121 // We need to emit stack operations specially here because the frame register
122 // used in the prologue/epilogue differs from the one getFrameRegister would
123 // return for the rest of the function.
124 static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
125 const SIMachineFunctionInfo &FuncInfo,
126 LivePhysRegs &LiveRegs, MachineFunction &MF,
127 MachineBasicBlock &MBB,
128 MachineBasicBlock::iterator I, const DebugLoc &DL,
129 Register SpillReg, int FI) {
130 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
131 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
132
133 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
134 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
135 MachineMemOperand *MMO = MF.getMachineMemOperand(
136 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
137 FrameInfo.getObjectAlign(FI));
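  // Note: SpillReg is marked live around the store below, presumably so that
  // any register scavenging done inside buildSpillLoadStore while materializing
  // the offset will not reuse SpillReg as a temporary.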
138 LiveRegs.addReg(SpillReg);
139 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, true,
140 FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
141 &LiveRegs);
142 LiveRegs.removeReg(SpillReg);
143 }
144
145 static void buildEpilogRestore(const GCNSubtarget &ST,
146 const SIRegisterInfo &TRI,
147 const SIMachineFunctionInfo &FuncInfo,
148 LivePhysRegs &LiveRegs, MachineFunction &MF,
149 MachineBasicBlock &MBB,
150 MachineBasicBlock::iterator I,
151 const DebugLoc &DL, Register SpillReg, int FI) {
152 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
153 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
154
155 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
156 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
157 MachineMemOperand *MMO = MF.getMachineMemOperand(
158 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
159 FrameInfo.getObjectAlign(FI));
160 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false,
161 FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
162 &LiveRegs);
163 }
164
165 static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
166 const DebugLoc &DL, const SIInstrInfo *TII,
167 Register TargetReg) {
168 MachineFunction *MF = MBB.getParent();
169 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
170 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
171 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
172 Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
173 Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);
174
175 if (MFI->getGITPtrHigh() != 0xffffffff) {
176 BuildMI(MBB, I, DL, SMovB32, TargetHi)
177 .addImm(MFI->getGITPtrHigh())
178 .addReg(TargetReg, RegState::ImplicitDefine);
179 } else {
180 const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
181 BuildMI(MBB, I, DL, GetPC64, TargetReg);
182 }
183 Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
184 MF->getRegInfo().addLiveIn(GitPtrLo);
185 MBB.addLiveIn(GitPtrLo);
186 BuildMI(MBB, I, DL, SMovB32, TargetLo)
187 .addReg(GitPtrLo);
188 }
189
190 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
191 void SIFrameLowering::emitEntryFunctionFlatScratchInit(
192 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
193 const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
194 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
195 const SIInstrInfo *TII = ST.getInstrInfo();
196 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
197 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
198
199 // We don't need this if we only have spills since there is no user facing
200 // scratch.
201
202 // TODO: If we know we don't have flat instructions earlier, we can omit
203 // this from the input registers.
204 //
205 // TODO: We only need to know if we access scratch space through a flat
206 // pointer. Because we only detect if flat instructions are used at all,
207 // this will be used more often than necessary on VI.
208
209 Register FlatScrInitLo;
210 Register FlatScrInitHi;
211
212 if (ST.isAmdPalOS()) {
213 // Extract the scratch offset from the descriptor in the GIT
214 LivePhysRegs LiveRegs;
215 LiveRegs.init(*TRI);
216 LiveRegs.addLiveIns(MBB);
217
218 // Find unused reg to load flat scratch init into
219 MachineRegisterInfo &MRI = MF.getRegInfo();
220 Register FlatScrInit = AMDGPU::NoRegister;
221 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
222 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
223 AllSGPR64s = AllSGPR64s.slice(
224 std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
225 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
226 for (MCPhysReg Reg : AllSGPR64s) {
227 if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
228 !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
229 FlatScrInit = Reg;
230 break;
231 }
232 }
233 assert(FlatScrInit && "Failed to find free register for scratch init");
234
235 FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
236 FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
237
238 buildGitPtr(MBB, I, DL, TII, FlatScrInit);
239
240 // We now have the GIT ptr - now get the scratch descriptor from the entry
241 // at offset 0 (or offset 16 for a compute shader).
242 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
243 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
244 auto *MMO = MF.getMachineMemOperand(
245 PtrInfo,
246 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
247 MachineMemOperand::MODereferenceable,
248 8, Align(4));
249 unsigned Offset =
250 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
251 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
252 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
253 BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
254 .addReg(FlatScrInit)
255 .addImm(EncodedOffset) // offset
256 .addImm(0) // cpol
257 .addMemOperand(MMO);
258
259 // Mask the offset in [47:0] of the descriptor
260 const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
261 auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
262 .addReg(FlatScrInitHi)
263 .addImm(0xffff);
264 And->getOperand(3).setIsDead(); // Mark SCC as dead.
265 } else {
266 Register FlatScratchInitReg =
267 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
268 assert(FlatScratchInitReg);
269
270 MachineRegisterInfo &MRI = MF.getRegInfo();
271 MRI.addLiveIn(FlatScratchInitReg);
272 MBB.addLiveIn(FlatScratchInitReg);
273
274 FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
275 FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
276 }
277
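  // At this point FlatScrInitLo/Hi hold the 64-bit flat scratch base, loaded
  // either from the GIT descriptor (PAL) or from the FLAT_SCRATCH_INIT user
  // SGPRs. The code below adds the per-wave scratch offset and then, roughly:
  //   GFX10+:   writes the result into the FLAT_SCR_LO/HI hardware registers
  //             via s_setreg_b32;
  //   GFX9:     adds directly into the FLAT_SCR register pair;
  //   pre-GFX9: FLAT_SCR_LO gets the scratch size in bytes and FLAT_SCR_HI the
  //             offset in 256-byte units.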
278 // Do a 64-bit pointer add.
279 if (ST.flatScratchIsPointer()) {
280 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
281 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
282 .addReg(FlatScrInitLo)
283 .addReg(ScratchWaveOffsetReg);
284 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
285 FlatScrInitHi)
286 .addReg(FlatScrInitHi)
287 .addImm(0);
288 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
289
290 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
291 addReg(FlatScrInitLo).
292 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
293 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
294 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
295 addReg(FlatScrInitHi).
296 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
297 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
298 return;
299 }
300
301 // For GFX9.
302 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
303 .addReg(FlatScrInitLo)
304 .addReg(ScratchWaveOffsetReg);
305 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
306 AMDGPU::FLAT_SCR_HI)
307 .addReg(FlatScrInitHi)
308 .addImm(0);
309 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
310
311 return;
312 }
313
314 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
315
316 // Copy the size in bytes.
317 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
318 .addReg(FlatScrInitHi, RegState::Kill);
319
320 // Add wave offset in bytes to private base offset.
321 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
322 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
323 .addReg(FlatScrInitLo)
324 .addReg(ScratchWaveOffsetReg);
325
326 // Convert offset to 256-byte units.
327 auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
328 AMDGPU::FLAT_SCR_HI)
329 .addReg(FlatScrInitLo, RegState::Kill)
330 .addImm(8);
331 LShr->getOperand(3).setIsDead(true); // Mark SCC as dead.
332 }
333
334 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
335 // memory. They should have been removed by now.
336 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
337 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
338 I != E; ++I) {
339 if (!MFI.isDeadObjectIndex(I))
340 return false;
341 }
342
343 return true;
344 }
345
346 // Shift down registers reserved for the scratch RSRC.
347 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
348 MachineFunction &MF) const {
349
350 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
351 const SIInstrInfo *TII = ST.getInstrInfo();
352 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
353 MachineRegisterInfo &MRI = MF.getRegInfo();
354 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
355
356 assert(MFI->isEntryFunction());
357
358 Register ScratchRsrcReg = MFI->getScratchRSrcReg();
359
360 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
361 allStackObjectsAreDead(MF.getFrameInfo())))
362 return Register();
363
364 if (ST.hasSGPRInitBug() ||
365 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
366 return ScratchRsrcReg;
367
368 // We reserved the last registers for this. Shift it down to the end of those
369 // which were actually used.
370 //
371 // FIXME: It might be safer to use a pseudoregister before replacement.
372
373 // FIXME: We should be able to eliminate unused input registers. We only
374 // cannot do this for the resources required for scratch access. For now we
375 // skip over user SGPRs and may leave unused holes.
376
377 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
378 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
379 AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
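  // NumPreloaded is rounded up to 4-SGPR granularity above, so the slice skips
  // every SGPR128 tuple that overlaps a preloaded input register.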
380
381 // Skip the last N reserved elements because they should have already been
382 // reserved for VCC etc.
383 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
384 for (MCPhysReg Reg : AllSGPR128s) {
385 // Pick the first unallocated one. Make sure we don't clobber the other
386 // reserved input we needed. Also for PAL, make sure we don't clobber
387 // the GIT pointer passed in SGPR0 or SGPR8.
388 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
389 !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
390 MRI.replaceRegWith(ScratchRsrcReg, Reg);
391 MFI->setScratchRSrcReg(Reg);
392 return Reg;
393 }
394 }
395
396 return ScratchRsrcReg;
397 }
398
399 static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
400 return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
401 }
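// Rationale: with MUBUF scratch, SP/FP are per-wave byte offsets into swizzled
// scratch memory, so per-lane frame sizes are scaled by the wavefront size;
// with flat scratch, addressing is per lane and no scaling is needed.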
402
403 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
404 MachineBasicBlock &MBB) const {
405 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
406
407 // FIXME: If we only have SGPR spills, we won't actually be using scratch
408 // memory since these spill to VGPRs. We should be cleaning up these unused
409 // SGPR spill frame indices somewhere.
410
411 // FIXME: We still have implicit uses on SGPR spill instructions in case they
412 // need to spill to vector memory. It's likely that will not happen, but at
413 // this point it appears we need the setup. This part of the prolog should be
414 // emitted after frame indices are eliminated.
415
416 // FIXME: Remove all of the isPhysRegUsed checks
417
418 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
419 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
420 const SIInstrInfo *TII = ST.getInstrInfo();
421 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
422 MachineRegisterInfo &MRI = MF.getRegInfo();
423 const Function &F = MF.getFunction();
424 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
425
426 assert(MFI->isEntryFunction());
427
428 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
429 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
430
431 // We need to do the replacement of the private segment buffer register even
432 // if there are no stack objects. There could be stores to undef or a
433 // constant without an associated object.
434 //
435 // This will return `Register()` in cases where there are no actual
436 // uses of the SRSRC.
437 Register ScratchRsrcReg;
438 if (!ST.enableFlatScratch())
439 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
440
441 // Make the selected register live throughout the function.
442 if (ScratchRsrcReg) {
443 for (MachineBasicBlock &OtherBB : MF) {
444 if (&OtherBB != &MBB) {
445 OtherBB.addLiveIn(ScratchRsrcReg);
446 }
447 }
448 }
449
450 // Now that we have fixed the reserved SRSRC we need to locate the
451 // (potentially) preloaded SRSRC.
452 Register PreloadedScratchRsrcReg;
453 if (ST.isAmdHsaOrMesa(F)) {
454 PreloadedScratchRsrcReg =
455 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
456 if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
457 // We added live-ins during argument lowering, but since they were not
458 // used they were deleted. We're adding the uses now, so add them back.
459 MRI.addLiveIn(PreloadedScratchRsrcReg);
460 MBB.addLiveIn(PreloadedScratchRsrcReg);
461 }
462 }
463
464 // Debug location must be unknown since the first debug location is used to
465 // determine the end of the prologue.
466 DebugLoc DL;
467 MachineBasicBlock::iterator I = MBB.begin();
468
469 // We found the SRSRC first because it needs four registers and has an
470 // alignment requirement. If the SRSRC that we found clobbers the scratch
471 // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
472 // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
473 // free SGPR.
474 Register ScratchWaveOffsetReg;
475 if (PreloadedScratchWaveOffsetReg &&
476 TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
477 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
478 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
479 AllSGPRs = AllSGPRs.slice(
480 std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
481 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
482 for (MCPhysReg Reg : AllSGPRs) {
483 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
484 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
485 ScratchWaveOffsetReg = Reg;
486 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
487 .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
488 break;
489 }
490 }
491 } else {
492 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
493 }
494 assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
495
496 if (requiresStackPointerReference(MF)) {
497 Register SPReg = MFI->getStackPtrOffsetReg();
498 assert(SPReg != AMDGPU::SP_REG);
499 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
500 .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
501 }
502
503 if (hasFP(MF)) {
504 Register FPReg = MFI->getFrameOffsetReg();
505 assert(FPReg != AMDGPU::FP_REG);
506 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
507 }
508
509 bool NeedsFlatScratchInit =
510 MFI->hasFlatScratchInit() &&
511 (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
512 (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
513
514 if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
515 PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
516 MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
517 MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
518 }
519
520 if (NeedsFlatScratchInit) {
521 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
522 }
523
524 if (ScratchRsrcReg) {
525 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
526 PreloadedScratchRsrcReg,
527 ScratchRsrcReg, ScratchWaveOffsetReg);
528 }
529 }
530
531 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
532 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
533 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
534 const DebugLoc &DL, Register PreloadedScratchRsrcReg,
535 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
536
537 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
538 const SIInstrInfo *TII = ST.getInstrInfo();
539 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
540 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
541 const Function &Fn = MF.getFunction();
542
543 if (ST.isAmdPalOS()) {
544 // The pointer to the GIT is formed from the offset passed in and either
545 // the amdgpu-git-ptr-high function attribute or the top part of the PC
546 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
547 Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
548
549 buildGitPtr(MBB, I, DL, TII, Rsrc01);
550
551 // We now have the GIT ptr - now get the scratch descriptor from the entry
552 // at offset 0 (or offset 16 for a compute shader).
553 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
554 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
555 auto MMO = MF.getMachineMemOperand(PtrInfo,
556 MachineMemOperand::MOLoad |
557 MachineMemOperand::MOInvariant |
558 MachineMemOperand::MODereferenceable,
559 16, Align(4));
560 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
561 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
562 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
563 BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
564 .addReg(Rsrc01)
565 .addImm(EncodedOffset) // offset
566 .addImm(0) // cpol
567 .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
568 .addMemOperand(MMO);
569
570 // The driver will always set the SRD for wave 64 (bits 118:117 of
571 // descriptor / bits 22:21 of third sub-reg will be 0b11)
572 // If the shader is actually wave32 we have to modify the const_index_stride
573 // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
574 // reason the driver does this is that there can be cases where it presents
575 // 2 shaders with different wave size (e.g. VsFs).
576 // TODO: convert to using SCRATCH instructions or multiple SRD buffers
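    // Schematically, s_bitset0_b32 rsrc3, 21 clears bit 21 of the descriptor's
    // third dword, turning the const_index_stride bits [22:21] from 0b11
    // (stride 64) into 0b10 (stride 32).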
577 if (ST.isWave32()) {
578 const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
579 BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
580 .addImm(21)
581 .addReg(Rsrc03);
582 }
583 } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
584 assert(!ST.isAmdHsaOrMesa(Fn));
585 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
586
587 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
588 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
589
590 // Use relocations to get the pointer, and setup the other bits manually.
591 uint64_t Rsrc23 = TII->getScratchRsrcWords23();
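    // Rsrc23 packs (roughly) the constant upper two dwords of the scratch
    // buffer descriptor - num_records and the format/flag bits - while the
    // lower two dwords, holding the base pointer, are filled in below from a
    // relocation or the implicit buffer pointer.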
592
593 if (MFI->hasImplicitBufferPtr()) {
594 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
595
596 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
597 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
598
599 BuildMI(MBB, I, DL, Mov64, Rsrc01)
600 .addReg(MFI->getImplicitBufferPtrUserSGPR())
601 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
602 } else {
603 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
604
605 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
606 auto MMO = MF.getMachineMemOperand(
607 PtrInfo,
608 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
609 MachineMemOperand::MODereferenceable,
610 8, Align(4));
611 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
612 .addReg(MFI->getImplicitBufferPtrUserSGPR())
613 .addImm(0) // offset
614 .addImm(0) // cpol
615 .addMemOperand(MMO)
616 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
617
618 MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
619 MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
620 }
621 } else {
622 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
623 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
624
625 BuildMI(MBB, I, DL, SMovB32, Rsrc0)
626 .addExternalSymbol("SCRATCH_RSRC_DWORD0")
627 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
628
629 BuildMI(MBB, I, DL, SMovB32, Rsrc1)
630 .addExternalSymbol("SCRATCH_RSRC_DWORD1")
631 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
632
633 }
634
635 BuildMI(MBB, I, DL, SMovB32, Rsrc2)
636 .addImm(Rsrc23 & 0xffffffff)
637 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
638
639 BuildMI(MBB, I, DL, SMovB32, Rsrc3)
640 .addImm(Rsrc23 >> 32)
641 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
642 } else if (ST.isAmdHsaOrMesa(Fn)) {
643 assert(PreloadedScratchRsrcReg);
644
645 if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
646 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
647 .addReg(PreloadedScratchRsrcReg, RegState::Kill);
648 }
649 }
650
651 // Add the scratch wave offset into the scratch RSRC.
652 //
653 // We only want to update the first 48 bits, which is the base address
654 // pointer, without touching the adjacent 16 bits of flags. We know this add
655 // cannot carry-out from bit 47, otherwise the scratch allocation would be
656 // impossible to fit in the 48-bit global address space.
657 //
658 // TODO: Evaluate if it is better to just construct an SRD using the flat
659 // scratch init and some constants rather than update the one we are passed.
660 Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
661 Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
662
663 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
664 // the kernel body via inreg arguments.
665 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
666 .addReg(ScratchRsrcSub0)
667 .addReg(ScratchWaveOffsetReg)
668 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
669 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
670 .addReg(ScratchRsrcSub1)
671 .addImm(0)
672 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
673 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
674 }
675
676 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
677 switch (ID) {
678 case TargetStackID::Default:
679 case TargetStackID::NoAlloc:
680 case TargetStackID::SGPRSpill:
681 return true;
682 case TargetStackID::ScalableVector:
683 case TargetStackID::WasmLocal:
684 return false;
685 }
686 llvm_unreachable("Invalid TargetStackID::Value");
687 }
688
689 static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
690 const SIMachineFunctionInfo *FuncInfo,
691 MachineFunction &MF, MachineBasicBlock &MBB,
692 MachineBasicBlock::iterator MBBI, bool IsProlog) {
693 if (LiveRegs.empty()) {
694 LiveRegs.init(TRI);
695 if (IsProlog) {
696 LiveRegs.addLiveIns(MBB);
697 } else {
698 // In epilog.
699 LiveRegs.addLiveOuts(MBB);
700 LiveRegs.stepBackward(*MBBI);
701 }
702 }
703 }
704
705 // Activate all lanes, returns saved exec.
706 static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
707 MachineFunction &MF,
708 MachineBasicBlock &MBB,
709 MachineBasicBlock::iterator MBBI,
710 bool IsProlog) {
711 Register ScratchExecCopy;
712 MachineRegisterInfo &MRI = MF.getRegInfo();
713 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
714 const SIInstrInfo *TII = ST.getInstrInfo();
715 const SIRegisterInfo &TRI = TII->getRegisterInfo();
716 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
717 DebugLoc DL;
718
719 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
720
721 ScratchExecCopy = findScratchNonCalleeSaveRegister(
722 MRI, LiveRegs, *TRI.getWaveMaskRegClass());
723 if (!ScratchExecCopy)
724 report_fatal_error("failed to find free scratch register");
725
726 LiveRegs.addReg(ScratchExecCopy);
727
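  // s_or_saveexec_b{32,64} ScratchExecCopy, -1 saves the current exec mask
  // into ScratchExecCopy and ORs exec with all ones, so every lane is active
  // for the whole-wave spills/restores that follow.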
728 const unsigned OrSaveExec =
729 ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
730 auto SaveExec = BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
731 .addImm(-1);
732 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
733
734 return ScratchExecCopy;
735 }
736
737 // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
738 // Otherwise we are spilling to memory.
739 static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) {
740 const MachineFrameInfo &MFI = MF.getFrameInfo();
741 return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill;
742 }
743
744 void SIFrameLowering::emitPrologue(MachineFunction &MF,
745 MachineBasicBlock &MBB) const {
746 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
747 if (FuncInfo->isEntryFunction()) {
748 emitEntryFunctionPrologue(MF, MBB);
749 return;
750 }
751
752 MachineFrameInfo &MFI = MF.getFrameInfo();
753 MachineRegisterInfo &MRI = MF.getRegInfo();
754 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
755 const SIInstrInfo *TII = ST.getInstrInfo();
756 const SIRegisterInfo &TRI = TII->getRegisterInfo();
757
758 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
759 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
760 Register BasePtrReg =
761 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
762 LivePhysRegs LiveRegs;
763
764 MachineBasicBlock::iterator MBBI = MBB.begin();
765 DebugLoc DL;
766
767 bool HasFP = false;
768 bool HasBP = false;
769 uint32_t NumBytes = MFI.getStackSize();
770 uint32_t RoundedSize = NumBytes;
771 // To avoid clobbering VGPRs in lanes that weren't active on function entry,
772 // turn on all lanes before doing the spill to memory.
773 Register ScratchExecCopy;
774
775 Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex;
776 Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex;
777
778 // VGPRs used for SGPR->VGPR spills
779 for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
780 FuncInfo->getSGPRSpillVGPRs()) {
781 if (!Reg.FI)
782 continue;
783
784 if (!ScratchExecCopy)
785 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI,
786 /*IsProlog*/ true);
787
788 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, Reg.VGPR,
789 *Reg.FI);
790 }
791
792 for (auto ReservedWWM : FuncInfo->wwmAllocation()) {
793 if (!ScratchExecCopy)
794 ScratchExecCopy =
795 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
796
797 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
798 std::get<0>(ReservedWWM), std::get<1>(ReservedWWM));
799 }
800
801 if (ScratchExecCopy) {
802 // FIXME: Split block and make terminator.
803 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
804 MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
805 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
806 .addReg(ScratchExecCopy, RegState::Kill);
807 LiveRegs.addReg(ScratchExecCopy);
808 }
809
810 auto SaveSGPRToMemory = [&](Register Reg, const int FI) {
811 assert(!MFI.isDeadObjectIndex(FI));
812
813 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
814
815 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
816 MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
817 if (!TmpVGPR)
818 report_fatal_error("failed to find free scratch register");
819
820 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
821 .addReg(Reg);
822
823 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR,
824 FI);
825 };
826
827 auto SaveSGPRToVGPRLane = [&](Register Reg, const int FI) {
828 assert(!MFI.isDeadObjectIndex(FI));
829
830 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
831 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
832 FuncInfo->getSGPRToVGPRSpills(FI);
833 assert(Spill.size() == 1);
834
835 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
836 .addReg(Reg)
837 .addImm(Spill[0].Lane)
838 .addReg(Spill[0].VGPR, RegState::Undef);
839 };
840
841 if (FPSaveIndex) {
842 if (spilledToMemory(MF, *FPSaveIndex))
843 SaveSGPRToMemory(FramePtrReg, *FPSaveIndex);
844 else
845 SaveSGPRToVGPRLane(FramePtrReg, *FPSaveIndex);
846 }
847
848 // Emit the copy if we need an FP, and are using a free SGPR to save it.
849 if (FuncInfo->SGPRForFPSaveRestoreCopy) {
850 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
851 FuncInfo->SGPRForFPSaveRestoreCopy)
852 .addReg(FramePtrReg)
853 .setMIFlag(MachineInstr::FrameSetup);
854 }
855
856 if (BPSaveIndex) {
857 if (spilledToMemory(MF, *BPSaveIndex))
858 SaveSGPRToMemory(BasePtrReg, *BPSaveIndex);
859 else
860 SaveSGPRToVGPRLane(BasePtrReg, *BPSaveIndex);
861 }
862
863 // Emit the copy if we need a BP, and are using a free SGPR to save it.
864 if (FuncInfo->SGPRForBPSaveRestoreCopy) {
865 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
866 FuncInfo->SGPRForBPSaveRestoreCopy)
867 .addReg(BasePtrReg)
868 .setMIFlag(MachineInstr::FrameSetup);
869 }
870
871 // If a copy has been emitted for FP and/or BP, make the SGPRs
872 // used in the copy instructions live throughout the function.
873 SmallVector<MCPhysReg, 2> TempSGPRs;
874 if (FuncInfo->SGPRForFPSaveRestoreCopy)
875 TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);
876
877 if (FuncInfo->SGPRForBPSaveRestoreCopy)
878 TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);
879
880 if (!TempSGPRs.empty()) {
881 for (MachineBasicBlock &MBB : MF) {
882 for (MCPhysReg Reg : TempSGPRs)
883 MBB.addLiveIn(Reg);
884
885 MBB.sortUniqueLiveIns();
886 }
887 if (!LiveRegs.empty()) {
888 LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
889 LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
890 }
891 }
892
893 if (TRI.hasStackRealignment(MF)) {
894 HasFP = true;
895 const unsigned Alignment = MFI.getMaxAlign().value();
896
897 RoundedSize += Alignment;
898 if (LiveRegs.empty()) {
899 LiveRegs.init(TRI);
900 LiveRegs.addLiveIns(MBB);
901 }
902
903 // s_add_i32 s33, s32, NumBytes
904 // s_and_b32 s33, s33, 0b111...0000
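    // Adding (Alignment - 1) and then masking with -Alignment rounds the
    // (scaled) stack pointer up to the next Alignment boundary, which becomes
    // the realigned frame pointer.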
905 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
906 .addReg(StackPtrReg)
907 .addImm((Alignment - 1) * getScratchScaleFactor(ST))
908 .setMIFlag(MachineInstr::FrameSetup);
909 auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
910 .addReg(FramePtrReg, RegState::Kill)
911 .addImm(-Alignment * getScratchScaleFactor(ST))
912 .setMIFlag(MachineInstr::FrameSetup);
913 And->getOperand(3).setIsDead(); // Mark SCC as dead.
914 FuncInfo->setIsStackRealigned(true);
915 } else if ((HasFP = hasFP(MF))) {
916 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
917 .addReg(StackPtrReg)
918 .setMIFlag(MachineInstr::FrameSetup);
919 }
920
921 // If we need a base pointer, set it up here. It's whatever the value of
922 // the stack pointer is at this point. Any variable size objects will be
923 // allocated after this, so we can still use the base pointer to reference
924 // the incoming arguments.
925 if ((HasBP = TRI.hasBasePointer(MF))) {
926 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
927 .addReg(StackPtrReg)
928 .setMIFlag(MachineInstr::FrameSetup);
929 }
930
931 if (HasFP && RoundedSize != 0) {
932 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
933 .addReg(StackPtrReg)
934 .addImm(RoundedSize * getScratchScaleFactor(ST))
935 .setMIFlag(MachineInstr::FrameSetup);
936 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
937 }
938
939 assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
940 FuncInfo->FramePointerSaveIndex)) &&
941 "Needed to save FP but didn't save it anywhere");
942
943 // If we allow spilling to AGPRs we may have saved FP but then spill
944 // everything into AGPRs instead of the stack.
945 assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
946 !FuncInfo->FramePointerSaveIndex) ||
947 EnableSpillVGPRToAGPR) &&
948 "Saved FP but didn't need it");
949
950 assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
951 FuncInfo->BasePointerSaveIndex)) &&
952 "Needed to save BP but didn't save it anywhere");
953
954 assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
955 !FuncInfo->BasePointerSaveIndex)) &&
956 "Saved BP but didn't need it");
957 }
958
959 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
960 MachineBasicBlock &MBB) const {
961 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
962 if (FuncInfo->isEntryFunction())
963 return;
964
965 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
966 const SIInstrInfo *TII = ST.getInstrInfo();
967 MachineRegisterInfo &MRI = MF.getRegInfo();
968 const SIRegisterInfo &TRI = TII->getRegisterInfo();
969 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
970 LivePhysRegs LiveRegs;
971 DebugLoc DL;
972
973 const MachineFrameInfo &MFI = MF.getFrameInfo();
974 uint32_t NumBytes = MFI.getStackSize();
975 uint32_t RoundedSize = FuncInfo->isStackRealigned()
976 ? NumBytes + MFI.getMaxAlign().value()
977 : NumBytes;
978 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
979 const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
980 const Register BasePtrReg =
981 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
982
983 Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex;
984 Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex;
985
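  // The epilogue mirrors the prologue in reverse: deallocate the stack, restore
  // FP/BP from their SGPR copies or spill slots, then restore the SGPR-spill
  // and WWM VGPRs from memory under an all-lanes exec mask.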
986 if (RoundedSize != 0 && hasFP(MF)) {
987 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
988 .addReg(StackPtrReg)
989 .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
990 .setMIFlag(MachineInstr::FrameDestroy);
991 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
992 }
993
994 if (FuncInfo->SGPRForFPSaveRestoreCopy) {
995 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
996 .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
997 .setMIFlag(MachineInstr::FrameDestroy);
998 }
999
1000 if (FuncInfo->SGPRForBPSaveRestoreCopy) {
1001 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
1002 .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
1003 .setMIFlag(MachineInstr::FrameDestroy);
1004 }
1005
1006 auto RestoreSGPRFromMemory = [&](Register Reg, const int FI) {
1007 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
1008 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
1009 MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
1010 if (!TmpVGPR)
1011 report_fatal_error("failed to find free scratch register");
1012 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR,
1013 FI);
1014 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), Reg)
1015 .addReg(TmpVGPR, RegState::Kill);
1016 };
1017
1018 auto RestoreSGPRFromVGPRLane = [&](Register Reg, const int FI) {
1019 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
1020 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
1021 FuncInfo->getSGPRToVGPRSpills(FI);
1022 assert(Spill.size() == 1);
1023 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), Reg)
1024 .addReg(Spill[0].VGPR)
1025 .addImm(Spill[0].Lane);
1026 };
1027
1028 if (FPSaveIndex) {
1029 const int FramePtrFI = *FPSaveIndex;
1030 assert(!MFI.isDeadObjectIndex(FramePtrFI));
1031 if (spilledToMemory(MF, FramePtrFI))
1032 RestoreSGPRFromMemory(FramePtrReg, FramePtrFI);
1033 else
1034 RestoreSGPRFromVGPRLane(FramePtrReg, FramePtrFI);
1035 }
1036
1037 if (BPSaveIndex) {
1038 const int BasePtrFI = *BPSaveIndex;
1039 assert(!MFI.isDeadObjectIndex(BasePtrFI));
1040 if (spilledToMemory(MF, BasePtrFI))
1041 RestoreSGPRFromMemory(BasePtrReg, BasePtrFI);
1042 else
1043 RestoreSGPRFromVGPRLane(BasePtrReg, BasePtrFI);
1044 }
1045
1046 Register ScratchExecCopy;
1047 for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
1048 FuncInfo->getSGPRSpillVGPRs()) {
1049 if (!Reg.FI)
1050 continue;
1051
1052 if (!ScratchExecCopy)
1053 ScratchExecCopy =
1054 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
1055
1056 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
1057 Reg.VGPR, *Reg.FI);
1058 }
1059
1060 for (auto ReservedWWM : FuncInfo->wwmAllocation()) {
1061 if (!ScratchExecCopy)
1062 ScratchExecCopy =
1063 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
1064
1065 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
1066 std::get<0>(ReservedWWM), std::get<1>(ReservedWWM));
1067 }
1068
1069 if (ScratchExecCopy) {
1070 // FIXME: Split block and make terminator.
1071 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1072 MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1073 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
1074 .addReg(ScratchExecCopy, RegState::Kill);
1075 }
1076 }
1077
1078 #ifndef NDEBUG
1079 static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1080 const MachineFrameInfo &MFI = MF.getFrameInfo();
1081 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1082 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1083 I != E; ++I) {
1084 if (!MFI.isDeadObjectIndex(I) &&
1085 MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1086 (I != FuncInfo->FramePointerSaveIndex &&
1087 I != FuncInfo->BasePointerSaveIndex)) {
1088 return false;
1089 }
1090 }
1091
1092 return true;
1093 }
1094 #endif
1095
1096 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1097 int FI,
1098 Register &FrameReg) const {
1099 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1100
1101 FrameReg = RI->getFrameRegister(MF);
1102 return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
1103 }
1104
1105 void SIFrameLowering::processFunctionBeforeFrameFinalized(
1106 MachineFunction &MF,
1107 RegScavenger *RS) const {
1108 MachineFrameInfo &MFI = MF.getFrameInfo();
1109
1110 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1111 const SIInstrInfo *TII = ST.getInstrInfo();
1112 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1113 MachineRegisterInfo &MRI = MF.getRegInfo();
1114 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1115
1116 if (!FuncInfo->isEntryFunction()) {
1117 // Spill VGPRs used for Whole Wave Mode
1118 FuncInfo->allocateWWMReservedSpillSlots(MFI, *TRI);
1119 }
1120
1121 const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1122 && EnableSpillVGPRToAGPR;
1123
1124 if (SpillVGPRToAGPR) {
1125 // To track the spill frame indices handled in this pass.
1126 BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1127 BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
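    // SpillFIs collects indices whose VGPR spill was redirected to an AGPR;
    // NonVGPRSpillFIs collects indices still accessed by ordinary stack
    // loads/stores. An index in the first set but not the second no longer
    // needs stack space and is marked dead below.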
1128
1129 bool SeenDbgInstr = false;
1130
1131 for (MachineBasicBlock &MBB : MF) {
1132 for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
1133 int FrameIndex;
1134 if (MI.isDebugInstr())
1135 SeenDbgInstr = true;
1136
1137 if (TII->isVGPRSpill(MI)) {
1138 // Try to eliminate stack used by VGPR spills before frame
1139 // finalization.
1140 unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1141 AMDGPU::OpName::vaddr);
1142 int FI = MI.getOperand(FIOp).getIndex();
1143 Register VReg =
1144 TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
1145 if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1146 TRI->isAGPR(MRI, VReg))) {
1147 // FIXME: change to enterBasicBlockEnd()
1148 RS->enterBasicBlock(MBB);
1149 TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
1150 SpillFIs.set(FI);
1151 continue;
1152 }
1153 } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
1154 TII->isLoadFromStackSlot(MI, FrameIndex))
1155 if (!MFI.isFixedObjectIndex(FrameIndex))
1156 NonVGPRSpillFIs.set(FrameIndex);
1157 }
1158 }
1159
1160 // Stack slot coloring may assign different objects to the same stack slot.
1161 // If not, then the VGPR to AGPR spill slot is dead.
1162 for (unsigned FI : SpillFIs.set_bits())
1163 if (!NonVGPRSpillFIs.test(FI))
1164 FuncInfo->setVGPRToAGPRSpillDead(FI);
1165
1166 for (MachineBasicBlock &MBB : MF) {
1167 for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1168 MBB.addLiveIn(Reg);
1169
1170 for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1171 MBB.addLiveIn(Reg);
1172
1173 MBB.sortUniqueLiveIns();
1174
1175 if (!SpillFIs.empty() && SeenDbgInstr) {
1176 // FIXME: The dead frame indices are replaced with a null register from
1177 // the debug value instructions. We should instead update them with the
1178 // correct register value, though the register value alone may not suffice.
1179 for (MachineInstr &MI : MBB) {
1180 if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
1181 SpillFIs[MI.getOperand(0).getIndex()]) {
1182 MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
1183 }
1184 }
1185 }
1186 }
1187 }
1188
1189 // At this point we've already allocated all spilled SGPRs to VGPRs if we
1190 // can. Any remaining SGPR spills will go to memory, so move them back to the
1191 // default stack.
1192 bool HaveSGPRToVMemSpill =
1193 FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1194 assert(allSGPRSpillsAreDead(MF) &&
1195 "SGPR spill should have been removed in SILowerSGPRSpills");
1196
1197 // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1198 // but currently hasNonSpillStackObjects is set only from source
1199 // allocas. Stack temps produced from legalization are not counted currently.
1200 if (!allStackObjectsAreDead(MFI)) {
1201 assert(RS && "RegScavenger required if spilling");
1202
1203 // Add an emergency spill slot
1204 RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));
1205
1206 // If we are spilling SGPRs to memory with a large frame, we may need a
1207 // second VGPR emergency frame index.
1208 if (HaveSGPRToVMemSpill &&
1209 allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1210 RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
1211 }
1212 }
1213 }
1214
1215 void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1216 MachineFunction &MF, RegScavenger *RS) const {
1217 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1218 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1219 MachineRegisterInfo &MRI = MF.getRegInfo();
1220 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1221
1222 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1223 // On gfx908, we initially reserved the highest available VGPR for the AGPR
1224 // copy. Now that RA is done, check whether there is an unused VGPR that is
1225 // lower than the one reserved before RA. If one exists, use it for the AGPR
1226 // copy instead of the earlier reservation.
1227 Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1228 Register UnusedLowVGPR =
1229 TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
1230 if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
1231 TRI->getHWRegIndex(VGPRForAGPRCopy))) {
1232 // Call to setVGPRForAGPRCopy() should happen first before calling
1233 // freezeReservedRegs() so that getReservedRegs() can reserve this newly
1234 // identified VGPR (for AGPR copy).
1235 FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1236 MRI.freezeReservedRegs(MF);
1237 }
1238 }
1239 }
1240
1241 // Only report VGPRs to generic code.
1242 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1243 BitVector &SavedVGPRs,
1244 RegScavenger *RS) const {
1245 TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
1246 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1247 if (MFI->isEntryFunction())
1248 return;
1249
1250 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1251 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1252 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1253
1254 // Ignore the SGPRs the default implementation found.
1255 SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
1256
1257 // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1258 // On gfx908 there are no direct AGPR loads and stores, so spilling an AGPR
1259 // also requires a temporary VGPR.
1260 if (!ST.hasGFX90AInsts())
1261 SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
1262
1263 // hasFP only knows about stack objects that already exist. We're now
1264 // determining the stack slots that will be created, so we have to predict
1265 // them. Stack objects force FP usage with calls.
1266 //
1267 // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1268 // don't want to report it here.
1269 //
1270 // FIXME: Is this really hasReservedCallFrame?
1271 const bool WillHaveFP =
1272 FrameInfo.hasCalls() &&
1273 (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1274
1275 // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
1276 // so don't allow the default insertion to handle them.
1277 for (auto SSpill : MFI->getSGPRSpillVGPRs())
1278 SavedVGPRs.reset(SSpill.VGPR);
1279
1280 LivePhysRegs LiveRegs;
1281 LiveRegs.init(*TRI);
1282
1283 if (WillHaveFP || hasFP(MF)) {
1284 assert(!MFI->SGPRForFPSaveRestoreCopy && !MFI->FramePointerSaveIndex &&
1285 "Re-reserving spill slot for FP");
1286 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
1287 MFI->FramePointerSaveIndex, true);
1288 }
1289
1290 if (TRI->hasBasePointer(MF)) {
1291 if (MFI->SGPRForFPSaveRestoreCopy)
1292 LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
1293
1294 assert(!MFI->SGPRForBPSaveRestoreCopy &&
1295 !MFI->BasePointerSaveIndex && "Re-reserving spill slot for BP");
1296 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
1297 MFI->BasePointerSaveIndex, false);
1298 }
1299 }
1300
1301 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1302 BitVector &SavedRegs,
1303 RegScavenger *RS) const {
1304 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1305 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1306 if (MFI->isEntryFunction())
1307 return;
1308
1309 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1310 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1311
1312 // The SP is specifically managed and we don't want extra spills of it.
1313 SavedRegs.reset(MFI->getStackPtrOffsetReg());
1314
1315 const BitVector AllSavedRegs = SavedRegs;
1316 SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
1317
1318 // We have to anticipate introducing CSR VGPR spills, or a spill of the
1319 // caller-saved VGPR reserved for SGPR spills, since we now always create a
1320 // stack entry for it even when there are no other stack objects: an FP is
1321 // required whenever there is both a call and a stack. A VGPR is allocated
1322 // for SGPR spills whenever there are any SGPR spills, CSR or otherwise.
1323 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1324 const bool WillHaveFP =
1325 FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1326
1327 // FP will be specially managed like SP.
1328 if (WillHaveFP || hasFP(MF))
1329 SavedRegs.reset(MFI->getFrameOffsetReg());
1330
1331 // Return address use by the return instruction is hidden behind the SI_RETURN
1332 // pseudo. Given that, and since IPRA computes actual register usage and does
1333 // not use the CSR list, clobbering of the return address by function calls
1334 // (D117243) or otherwise (D120922) is not seen by IPRA's register usage
1335 // collection. Explicitly saving it here ensures the return address is
1336 // saved/restored in those scenarios.
1337 const MachineRegisterInfo &MRI = MF.getRegInfo();
1338 Register RetAddrReg = TRI->getReturnAddressReg(MF);
1339 if (!MFI->isEntryFunction() &&
1340 (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
1341 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1342 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1343 }
1344 }
1345
1346 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1347 MachineFunction &MF, const TargetRegisterInfo *TRI,
1348 std::vector<CalleeSavedInfo> &CSI) const {
1349 if (CSI.empty())
1350 return true; // Early exit if no callee saved registers are modified!
1351
1352 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1353 if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
1354 !FuncInfo->SGPRForBPSaveRestoreCopy)
1355 return false;
1356
1357 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1358 const SIRegisterInfo *RI = ST.getRegisterInfo();
1359 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1360 Register BasePtrReg = RI->getBaseRegister();
1361 unsigned NumModifiedRegs = 0;
1362
1363 if (FuncInfo->SGPRForFPSaveRestoreCopy)
1364 NumModifiedRegs++;
1365 if (FuncInfo->SGPRForBPSaveRestoreCopy)
1366 NumModifiedRegs++;
1367
1368 for (auto &CS : CSI) {
1369 if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
1370 CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
1371 if (--NumModifiedRegs)
1372 break;
1373 } else if (CS.getReg() == BasePtrReg &&
1374 FuncInfo->SGPRForBPSaveRestoreCopy) {
1375 CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
1376 if (--NumModifiedRegs)
1377 break;
1378 }
1379 }
1380
1381 return false;
1382 }
1383
1384 bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1385 const MachineFunction &MF) const {
1386
1387 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1388 const MachineFrameInfo &MFI = MF.getFrameInfo();
1389 uint64_t EstStackSize = MFI.estimateStackSize(MF);
1390 uint64_t MaxOffset = EstStackSize - 1;
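  // MaxOffset is the worst-case offset an emergency slot placed at the top of
  // the frame could need. If that still fits in the MUBUF / flat-scratch
  // immediate, the default placement works and we return false.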
1391
1392 // We need the emergency stack slots to be allocated in range of the
1393 // MUBUF/flat scratch immediate offset from the base register, so assign these
1394 // first at the incoming SP position.
1395 //
1396 // TODO: We could try sorting the objects to find a hole in the first bytes
1397 // rather than allocating as close as possible. This could save a lot of space
1398 // on frames with alignment requirements.
1399 if (ST.enableFlatScratch()) {
1400 const SIInstrInfo *TII = ST.getInstrInfo();
1401 if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1402 SIInstrFlags::FlatScratch))
1403 return false;
1404 } else {
1405 if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset))
1406 return false;
1407 }
1408
1409 return true;
1410 }
1411
1412 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1413 MachineFunction &MF,
1414 MachineBasicBlock &MBB,
1415 MachineBasicBlock::iterator I) const {
1416 int64_t Amount = I->getOperand(0).getImm();
1417 if (Amount == 0)
1418 return MBB.erase(I);
1419
1420 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1421 const SIInstrInfo *TII = ST.getInstrInfo();
1422 const DebugLoc &DL = I->getDebugLoc();
1423 unsigned Opc = I->getOpcode();
1424 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1425 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1426
1427 if (!hasReservedCallFrame(MF)) {
1428 Amount = alignTo(Amount, getStackAlign());
1429 assert(isUInt<32>(Amount) && "exceeded stack address space size");
1430 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1431 Register SPReg = MFI->getStackPtrOffsetReg();
1432
1433 Amount *= getScratchScaleFactor(ST);
1434 if (IsDestroy)
1435 Amount = -Amount;
1436 auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1437 .addReg(SPReg)
1438 .addImm(Amount);
1439 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1440 } else if (CalleePopAmount != 0) {
1441 llvm_unreachable("is this used?");
1442 }
1443
1444 return MBB.erase(I);
1445 }
1446
1447 /// Returns true if the frame will require a reference to the stack pointer.
1448 ///
1449 /// This is the set of conditions common to setting up the stack pointer in a
1450 /// kernel, and for using a frame pointer in a callable function.
1451 ///
1452 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1453 /// references SP.
1454 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1455 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1456 }
1457
1458 // The FP for kernels is always known to be 0, so we never really need to set
1459 // up an explicit register for it. However, DisableFramePointerElim will force
1460 // us to use a register for it.
1461 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1462 const MachineFrameInfo &MFI = MF.getFrameInfo();
1463
1464 // For entry functions we can use an immediate offset in most cases, so the
1465 // presence of calls doesn't imply we need a distinct frame pointer.
1466 if (MFI.hasCalls() &&
1467 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1468 // All offsets are unsigned, so need to be addressed in the same direction
1469 // as stack growth.
1470
1471 // FIXME: This function is pretty broken, since it can be called before the
1472 // frame layout is determined or CSR spills are inserted.
1473 return MFI.getStackSize() != 0;
1474 }
1475
1476 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1477 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1478 MF) ||
1479 MF.getTarget().Options.DisableFramePointerElim(MF);
1480 }
1481
1482 // This is essentially a reduced version of hasFP for entry functions. Since the
1483 // stack pointer is known 0 on entry to kernels, we never really need an FP
1484 // register. We may need to initialize the stack pointer depending on the frame
1485 // properties, which logically overlaps many of the cases where an ordinary
1486 // function would require an FP.
1487 bool SIFrameLowering::requiresStackPointerReference(
1488 const MachineFunction &MF) const {
1489 // Callable functions always require a stack pointer reference.
1490 assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1491 "only expected to call this for entry points");
1492
1493 const MachineFrameInfo &MFI = MF.getFrameInfo();
1494
1495 // Entry points ordinarily don't need to initialize SP. We have to set it up
1496 // for callees if there are any. Also note tail calls are impossible/don't
1497 // make any sense for kernels.
1498 if (MFI.hasCalls())
1499 return true;
1500
1501 // We still need to initialize the SP if we're doing anything weird that
1502 // references the SP, like variable sized stack objects.
1503 return frameTriviallyRequiresSP(MFI);
1504 }
1505