//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"


static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                         const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                                       const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  if (Unused) {
    // We are looking for a register that can be used throughout the entire
    // function, so any use is unacceptable.
    for (MCRegister Reg : RC) {
      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
        return Reg;
    }
  } else {
    for (MCRegister Reg : RC) {
      if (LiveRegs.available(MRI, Reg))
        return Reg;
    }
  }

  // If we require an unused register, we are in a context where failure is an
  // option and the caller has an alternative plan. In other contexts, this
  // must succeed.
  if (!Unused)
    report_fatal_error("failed to find free scratch register");

  return MCRegister();
}

static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
  LivePhysRegs LiveRegs;
  LiveRegs.init(*MRI.getTargetRegisterInfo());
  return findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
}

// We need to emit stack operations specially here because a different frame
// register is used than the one getFrameRegister would return for the rest of
// the function.
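// If the frame object's offset fits in the MUBUF instruction's 12-bit
// unsigned immediate, the _OFFSET form is emitted directly; otherwise the
// offset is first materialized into a free scratch VGPR and the _OFFEN form
// is used instead.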
static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I,
                             const SIInstrInfo *TII, Register SpillReg,
                             Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();

  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
      MFI.getObjectAlign(FI));

  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
        .addReg(SpillReg, RegState::Kill)
        .addReg(ScratchRsrcReg)
        .addReg(SPReg)
        .addImm(Offset)
        .addImm(0) // glc
        .addImm(0) // slc
        .addImm(0) // tfe
        .addImm(0) // dlc
        .addImm(0) // swz
        .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
      MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
      .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
      .addReg(SpillReg, RegState::Kill)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(0)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
}

static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
                              const SIInstrInfo *TII, Register SpillReg,
                              Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
      MFI.getObjectAlign(FI));

  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(),
            TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
        .addReg(ScratchRsrcReg)
        .addReg(SPReg)
        .addImm(Offset)
        .addImm(0) // glc
        .addImm(0) // slc
        .addImm(0) // tfe
        .addImm(0) // dlc
        .addImm(0) // swz
        .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
      MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
      .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(),
          TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(0)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
}

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScratchInitReg =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
          .addReg(FlatScrInitLo)
          .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
          .addReg(FlatScrInitHi)
          .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitLo)
          .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                          (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitHi)
          .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                          (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
        .addReg(FlatScrInitHi)
        .addImm(0);

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitLo, RegState::Kill)
      .addImm(8);
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. The only
  // ones we cannot eliminate are the resources required for scratch access.
  // For now we skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
  // FIXME: Hack to not crash in situations which emitted an error.
  if (!PreloadedScratchWaveOffsetReg)
    return;

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg);

  if (MF.getFrameInfo().hasCalls()) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (MFI->hasFlatScratchInit()) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
    Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    if (MFI->getGITPtrHigh() != 0xffffffff) {
      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
          .addImm(MFI->getGITPtrHigh())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    } else {
      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
    }
    Register GitPtrLo = MFI->getGITPtrLoReg(MF);
    MF.getRegInfo().addLiveIn(GitPtrLo);
    MBB.addLiveIn(GitPtrLo);
    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
        .addReg(GitPtrLo)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
        .addReg(Rsrc01)
        .addImm(EncodedOffset) // offset
        .addImm(0) // glc
        .addImm(0) // dlc
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
        .addMemOperand(MMO);
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
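    // getScratchRsrcWords23() supplies the constant third and fourth dwords of
    // the descriptor; only the 64-bit base pointer in dwords 0/1 needs to be
    // filled in at runtime below.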
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addImm(0) // offset
            .addImm(0) // glc
            .addImm(0) // dlc
            .addMemOperand(MMO)
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
          .addExternalSymbol("SCRATCH_RSRC_DWORD0")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
          .addExternalSymbol("SCRATCH_RSRC_DWORD1")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
        .addImm(Rsrc23 & 0xffffffff)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
        .addImm(Rsrc23 >> 32)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::SVEVector:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate all lanes, returns saved exec.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     bool IsProlog) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  DebugLoc DL;

  if (LiveRegs.empty()) {
    if (IsProlog) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      if (FuncInfo->SGPRForFPSaveRestoreCopy)
        LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
    } else {
      // In epilog.
      LiveRegs.init(*ST.getRegisterInfo());
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());

  if (!IsProlog)
    LiveRegs.removeReg(ScratchExecCopy);

  const unsigned OrSaveExec =
      ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
  BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);

  return ScratchExecCopy;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

  bool HasFP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;
  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
  // turn on all lanes before doing the spill to memory.
  Register ScratchExecCopy;

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
  // Otherwise we are spilling the FP to memory.
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  // Emit the copy if we need an FP, and are using a free SGPR to save it.
  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
            FuncInfo->SGPRForFPSaveRestoreCopy)
        .addReg(FramePtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
    // Make the register live throughout the function.
    for (MachineBasicBlock &MBB : MF)
      MBB.addLiveIn(FuncInfo->SGPRForFPSaveRestoreCopy);
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
       : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                     FuncInfo->getScratchRSrcReg(),
                     StackPtrReg,
                     Reg.FI.getValue());
  }

  if (HasFPSaveIndex && SpillFPToMemory) {
    assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(FramePtrReg);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
                     FuncInfo->FramePointerSaveIndex.getValue());
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }

  // In this case, spill the FP to a reserved VGPR.
  if (HasFPSaveIndex && !SpillFPToMemory) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
        FuncInfo->getSGPRToVGPRSpills(FI);
    assert(Spill.size() == 1);

    // Save FP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            Spill[0].VGPR)
        .addReg(FramePtrReg)
        .addImm(Spill[0].Lane)
        .addReg(Spill[0].VGPR, RegState::Undef);
  }

  if (TRI.needsStackRealignment(MF)) {
    HasFP = true;
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
    }

    Register ScratchSPReg = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);

    // s_add_u32 tmp_reg, s32, NumBytes
    // s_and_b32 s32, tmp_reg, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
        .addReg(ScratchSPReg, RegState::Kill)
        .addImm(-Alignment * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    // If we need a frame pointer, set it up here.
    // It's whatever the value of the stack pointer is at this point. Any
    // variable size objects will be allocated after this, so we can still use
    // the frame pointer to reference locals.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
  }

  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
                     FuncInfo->FramePointerSaveIndex)) &&
         "Needed to save FP but didn't save it anywhere");

  assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
                    !FuncInfo->FramePointerSaveIndex)) &&
         "Saved FP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  LivePhysRegs LiveRegs;
  DebugLoc DL;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  const Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  Register ScratchExecCopy;
  if (HasFPSaveIndex) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));
    if (SpillFPToMemory) {
      if (!ScratchExecCopy)
        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
                        FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
              FramePtrReg)
          .addReg(TempVGPR, RegState::Kill);
    } else {
      // Reload from VGPR spill.
      assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
          FuncInfo->getSGPRToVGPRSpills(FI);
      assert(Spill.size() == 1);
      BuildMI(MBB, MBBI, DL,
              TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), FramePtrReg)
          .addReg(Spill[0].VGPR)
          .addImm(Spill[0].Lane);
    }
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
       FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

    buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
                      Reg.FI.getValue());
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
                                 Optional<int> FramePointerSaveIndex) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        FramePointerSaveIndex && I != FramePointerSaveIndex) {
      return false;
    }
  }

  return true;
}
#endif

int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                            Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return MF.getFrameInfo().getObjectOffset(FI);
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  FuncInfo->removeDeadFrameIndices(MFI);
  assert(allSGPRSpillsAreDead(MFI, None) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    if (FuncInfo->isEntryFunction()) {
      int ScavengeFI = MFI.CreateFixedObject(
          TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
      RS->addScavengingFrameIndex(ScavengeFI);
    } else {
      int ScavengeFI = MFI.CreateStackObject(
          TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
          TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
          false);
      RS->addScavengingFrameIndex(ScavengeFI);
    }
  }
}

// Only report VGPRs to generic code.
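// SGPR callee saves are handled separately by determineCalleeSavesSGPR below,
// and the frame pointer save is given special treatment here: a free lane in
// an already-spilled VGPR, a spare SGPR copy, or, failing that, a memory
// spill slot.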
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
  // so don't allow the default insertion to handle them.
  for (auto SSpill : MFI->getSGPRSpillVGPRs())
    SavedVGPRs.reset(SSpill.VGPR);

  const bool HasFP = WillHaveFP || hasFP(MF);
  if (!HasFP)
    return;

  // We need to save and restore the current FP.

  // 1: If there is already a VGPR with free lanes, use it. We
  // may already have to pay the penalty for spilling a CSR VGPR.
  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
                                                    TargetStackID::SGPRSpill);

    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");

    MFI->FramePointerSaveIndex = NewFI;

    LLVM_DEBUG(
      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
      dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
             << ':' << Spill.Lane << '\n');
    return;
  }

  // 2: Next, try to save the FP in an unused SGPR.
  MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());

  if (!MFI->SGPRForFPSaveRestoreCopy) {
    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
                                                    TargetStackID::SGPRSpill);

    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP, so
      // we're forced to spill another VGPR to use for the spill.
      MFI->FramePointerSaveIndex = NewFI;
    } else {
      // 4: If all else fails, spill the FP to memory.
      MFI->FramePointerSaveIndex =
          FrameInfo.CreateSpillStackObject(4, Align(4));
    }

    LLVM_DEBUG(
      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
      dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
             << ':' << Spill.Lane << '\n';);
  } else {
    LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
               printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());
  SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (!FuncInfo->SGPRForFPSaveRestoreCopy)
    return false;

  for (auto &CS : CSI) {
    if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
      if (FuncInfo->SGPRForFPSaveRestoreCopy)
        CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      break;
    }
  }

  return false;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF,
    MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

    unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
    BuildMI(MBB, I, DL, TII->get(Op), SPReg)
        .addReg(SPReg)
        .addImm(Amount * ST.getWavefrontSize());
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry functions we can use an immediate offset in most cases, so the
  // presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
         MFI.hasStackMap() || MFI.hasPatchPoint() ||
         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()
             ->needsStackRealignment(MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}