//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"


static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                         const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                                       const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack
// pointer, but we would then have to make sure that we were in fact saving at
// least one callee-save register in the prologue, which is additional
// complexity that doesn't seem worth the benefit.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  if (Unused) {
    // We are looking for a register that can be used throughout the entire
    // function, so any use is unacceptable.
    for (MCRegister Reg : RC) {
      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
        return Reg;
    }
  } else {
    for (MCRegister Reg : RC) {
      if (LiveRegs.available(MRI, Reg))
        return Reg;
    }
  }

  // If we require an unused register, the callers are contexts where failure
  // is an option and they have an alternative plan. In other contexts, this
  // must succeed.
  if (!Unused)
    report_fatal_error("failed to find free scratch register");

  return MCRegister();
}

static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
  LivePhysRegs LiveRegs;
  LiveRegs.init(*MRI.getTargetRegisterInfo());
  return findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
}

// We need to specially emit stack operations here because a different frame
// register is used than in the rest of the function, as getFrameRegister would
// use.
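//
// Illustrative sketch of the emitted sequence (not the exact MIR): when the
// frame offset fits the 12-bit immediate field this produces roughly
//   buffer_store_dword <spill_vgpr>, off, <scratch_rsrc>, <sp> offset:<imm>
// and otherwise it materializes the offset in a scavenged VGPR first:
//   v_mov_b32 <tmp>, <imm>
//   buffer_store_dword <spill_vgpr>, <tmp>, <scratch_rsrc>, <sp> offen
// The epilogue reload below mirrors this with buffer loads.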
static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I,
                             const SIInstrInfo *TII, Register SpillReg,
                             Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();

  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
      MFI.getObjectAlign(FI));

  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
      .addReg(SpillReg, RegState::Kill)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
      MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
    .addReg(SpillReg, RegState::Kill)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addImm(0) // swz
    .addMemOperand(MMO);
}

static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
                              const SIInstrInfo *TII, Register SpillReg,
                              Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
      MFI.getObjectAlign(FI));

  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(),
            TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
      MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(),
          TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addImm(0) // swz
    .addMemOperand(MMO);
}

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`.
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.
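  //
  // When flat scratch is a pointer, the sequence emitted below is roughly:
  //   s_add_u32  flat_scr_lo, <flat_scratch_init_lo>, <scratch_wave_offset>
  //   s_addc_u32 flat_scr_hi, <flat_scratch_init_hi>, 0
  // On GFX10+ the result is written with s_setreg_b32 rather than to the
  // FLAT_SCR_LO/HI registers directly. This is an illustrative sketch, not
  // the exact emitted MIR.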

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScratchInitReg =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitLo).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitHi).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks.

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
  // FIXME: Hack to not crash in situations which emitted an error.
  if (!PreloadedScratchWaveOffsetReg)
    return;

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
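      // (The register is recorded both as a function live-in on the
      // MachineRegisterInfo and as a live-in of the entry block, so the
      // later read of the preloaded SRSRC is seen as live.)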
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg);

  if (MF.getFrameInfo().hasCalls()) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (MFI->hasFlatScratchInit()) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`.
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
    Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    if (MFI->getGITPtrHigh() != 0xffffffff) {
      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    } else {
      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
    }
    Register GitPtrLo = MFI->getGITPtrLoReg(MF);
    MF.getRegInfo().addLiveIn(GitPtrLo);
    MBB.addLiveIn(GitPtrLo);
    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
      .addReg(GitPtrLo)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // glc
      .addImm(0) // dlc
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
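    // Only the 64-bit base pointer in words 0 and 1 of the descriptor has to
    // be filled in dynamically (from the relocations or the implicit buffer
    // pointer below); words 2 and 3 are subtarget constants obtained from
    // getScratchRsrcWords23().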
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // glc
          .addImm(0) // dlc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
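  //
  // Roughly, the emitted sequence is:
  //   s_add_u32  <rsrc_lo>, <rsrc_lo>, <scratch_wave_offset>
  //   s_addc_u32 <rsrc_hi>, <rsrc_hi>, 0
  // (illustrative; the actual operands are the sub0/sub1 subregisters set up
  // above).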
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::SVEVector:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate all lanes, returns saved exec.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     bool IsProlog) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  DebugLoc DL;

  if (LiveRegs.empty()) {
    if (IsProlog) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      if (FuncInfo->SGPRForFPSaveRestoreCopy)
        LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
    } else {
      // In epilog.
      LiveRegs.init(*ST.getRegisterInfo());
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());

  if (!IsProlog)
    LiveRegs.removeReg(ScratchExecCopy);

  const unsigned OrSaveExec =
      ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
  BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);

  return ScratchExecCopy;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

  bool HasFP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;
  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
  // turn on all lanes before doing the spill to memory.
  Register ScratchExecCopy;

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
  // Otherwise we are spilling the FP to memory.
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  // Emit the copy if we need an FP, and are using a free SGPR to save it.
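  // The other save strategies (spilling the old FP to a VGPR lane or to
  // scratch memory) are handled further down, once FramePointerSaveIndex has
  // been examined.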
  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
            FuncInfo->SGPRForFPSaveRestoreCopy)
        .addReg(FramePtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
    // Make the register live throughout the function.
    for (MachineBasicBlock &MBB : MF)
      MBB.addLiveIn(FuncInfo->SGPRForFPSaveRestoreCopy);
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
       : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                     FuncInfo->getScratchRSrcReg(),
                     StackPtrReg,
                     Reg.FI.getValue());
  }

  if (HasFPSaveIndex && SpillFPToMemory) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(FramePtrReg);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
                     FuncInfo->FramePointerSaveIndex.getValue());
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }

  // In this case, spill the FP to a reserved VGPR.
  if (HasFPSaveIndex && !SpillFPToMemory) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
        FuncInfo->getSGPRToVGPRSpills(FI);
    assert(Spill.size() == 1);

    // Save FP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            Spill[0].VGPR)
        .addReg(FramePtrReg)
        .addImm(Spill[0].Lane)
        .addReg(Spill[0].VGPR, RegState::Undef);
  }

  if (TRI.needsStackRealignment(MF)) {
    HasFP = true;
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
    }

    Register ScratchSPReg = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);

    // s_add_u32 tmp_reg, s32, NumBytes
    // s_and_b32 s32, tmp_reg, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
        .addReg(ScratchSPReg, RegState::Kill)
        .addImm(-Alignment * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    // If we need a base pointer, set it up here. It's whatever the value of
    // the stack pointer is at this point. Any variable size objects will be
    // allocated after this, so we can still use the base pointer to reference
    // locals.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
  }

  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
                     FuncInfo->FramePointerSaveIndex)) &&
         "Needed to save FP but didn't save it anywhere");

  assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
                    !FuncInfo->FramePointerSaveIndex)) &&
         "Saved FP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  LivePhysRegs LiveRegs;
  DebugLoc DL;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  const Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  Register ScratchExecCopy;
  if (HasFPSaveIndex) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));
    if (SpillFPToMemory) {
      if (!ScratchExecCopy)
        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
                        FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
              FramePtrReg)
          .addReg(TempVGPR, RegState::Kill);
    } else {
      // Reload from VGPR spill.
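      // Roughly: v_readlane_b32 <frame_ptr>, <spill_vgpr>, <lane>
      // (illustrative sketch of the instruction built below).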
      assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
          FuncInfo->getSGPRToVGPRSpills(FI);
      assert(Spill.size() == 1);
      BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
              FramePtrReg)
          .addReg(Spill[0].VGPR)
          .addImm(Spill[0].Lane);
    }
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
       FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

    buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
                      Reg.FI.getValue());
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
                                 Optional<int> FramePointerSaveIndex) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        FramePointerSaveIndex && I != FramePointerSaveIndex) {
      return false;
    }
  }

  return true;
}
#endif

int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                            Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return MF.getFrameInfo().getObjectOffset(FI);
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  FuncInfo->removeDeadFrameIndices(MFI);
  assert(allSGPRSpillsAreDead(MFI, None) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    if (FuncInfo->isEntryFunction()) {
      int ScavengeFI = MFI.CreateFixedObject(
          TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
      RS->addScavengingFrameIndex(ScavengeFI);
    } else {
      int ScavengeFI = MFI.CreateStackObject(
          TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
          TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
          false);
      RS->addScavengingFrameIndex(ScavengeFI);
    }
  }
}

// Only report VGPRs to generic code.
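// SGPR callee saves are handled separately by determineCalleeSavesSGPR below,
// so the generic result computed here is masked down to VGPRs only.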
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
  // so don't allow the default insertion to handle them.
  for (auto SSpill : MFI->getSGPRSpillVGPRs())
    SavedVGPRs.reset(SSpill.VGPR);

  const bool HasFP = WillHaveFP || hasFP(MF);
  if (!HasFP)
    return;

  // We need to save and restore the current FP.

  // 1: If there is already a VGPR with free lanes, use it. We
  // may already have to pay the penalty for spilling a CSR VGPR.
  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
                                                    TargetStackID::SGPRSpill);

    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");

    MFI->FramePointerSaveIndex = NewFI;

    LLVM_DEBUG(
      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
      dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
             << ':' << Spill.Lane << '\n');
    return;
  }

  // 2: Next, try to save the FP in an unused SGPR.
  MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());

  if (!MFI->SGPRForFPSaveRestoreCopy) {
    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
                                                    TargetStackID::SGPRSpill);

    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP, so
      // we're forced to spill another VGPR to use for the spill.
      MFI->FramePointerSaveIndex = NewFI;
    } else {
      // 4: If all else fails, spill the FP to memory.
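      // (This slot is written with a v_mov/buffer_store pair in the prologue
      // and read back with buffer_load/v_readfirstlane in the epilogue; see
      // emitPrologue/emitEpilogue above.)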
      MFI->FramePointerSaveIndex =
          FrameInfo.CreateSpillStackObject(4, Align(4));
    }

    LLVM_DEBUG(
      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
      dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
             << ':' << Spill.Lane << '\n';);
  } else {
    LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
               printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());
  SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (!FuncInfo->SGPRForFPSaveRestoreCopy)
    return false;

  for (auto &CS : CSI) {
    if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
      if (FuncInfo->SGPRForFPSaveRestoreCopy)
        CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      break;
    }
  }

  return false;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF,
    MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

    unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
    BuildMI(MBB, I, DL, TII->get(Op), SPReg)
        .addReg(SPReg)
        .addImm(Amount * ST.getWavefrontSize());
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry functions we can use an immediate offset in most cases, so the
  // presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.
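    // A frame pointer provides a stable base for those references even after
    // the stack pointer has been adjusted around calls.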

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
         MFI.hasStackMap() || MFI.hasPatchPoint() ||
         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}