//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
//
// Side effect: every callee-saved register reported by \p MRI is added to
// \p LiveRegs, and stays there after this function returns.
//
// \param RC      register class to search, in the class's iteration order.
// \param Unused  when true, additionally reject any register for which
//                MRI.isPhysRegUsed() is true.
// \returns the first acceptable register, or a null MCRegister if none exists.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  // The callee-saved list is terminated by a zero entry.
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  if (Unused) {
    // We are looking for a register that can be used throughout the entire
    // function, so any use is unacceptable.
    for (MCRegister Reg : RC) {
      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
        return Reg;
    }
  } else {
    // Only liveness at the current point matters.
    for (MCRegister Reg : RC) {
      if (LiveRegs.available(MRI, Reg))
        return Reg;
    }
  }

  // Nothing suitable was found.
  return MCRegister();
}

// Decide where the frame pointer or base pointer will be saved, and report the
// choice through the output parameters.
//
// Exactly one of the outputs is set on return:
//  - \p FrameIndex receives a frame index when the value is saved to a VGPR
//    lane (stack ID SGPRSpill) or to a memory spill slot.
//  - \p TempSGPR receives a register when the value is saved by copying it
//    into an otherwise unused SGPR.
//
// Note that when strategy 1 below succeeds the function returns before
// \p TempSGPR is assigned, so it keeps whatever value the caller passed in.
//
// \p LiveRegs is forwarded to findScratchNonCalleeSaveRegister and is mutated
// by it. \p IsFP only selects "FP" or "BP" in the debug output.
static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
                                           LivePhysRegs &LiveRegs,
                                           Register &TempSGPR,
                                           Optional<int> &FrameIndex,
                                           bool IsFP) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // We need to save and restore the current FP/BP.

  // 1: If there is already a VGPR with free lanes, use it. We
  // may already have to pay the penalty for spilling a CSR VGPR.
  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
    // 4-byte, 4-aligned object tagged with the SGPRSpill stack ID.
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");

    FrameIndex = NewFI;

    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
               dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                      << '\n');
    return;
  }

  // 2: Next, try to save the FP/BP in an unused SGPR.
  TempSGPR = findScratchNonCalleeSaveRegister(
      MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);

  if (!TempSGPR) {
    // Tentatively create an SGPRSpill object; it is removed again below if the
    // VGPR-lane allocation does not succeed.
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP/BP,
      // so we're forced to spill another VGPR to use for the spill.
      FrameIndex = NewFI;

      LLVM_DEBUG(
          auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
          dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
                 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
    } else {
      // Remove dead <NewFI> index
      MF.getFrameInfo().RemoveStackObject(NewFI);
      // 4: If all else fails, spill the FP/BP to memory.
      FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling "
                        << (IsFP ? "FP" : "BP") << '\n');
    }
  } else {
    LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
                      << printReg(TempSGPR, TRI) << '\n');
  }
}

// We need to specially emit stack operations here because a different frame
// register is used than in the rest of the function, as getFrameRegister would
// use.
//
// Emits, before \p I in \p MBB, a dword store of \p SpillReg to frame index
// \p FI, addressed relative to the function's stack pointer offset register.
// The opcode is the scratch (flat-scratch) or buffer form depending on
// ST.enableFlatScratch().
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LivePhysRegs &LiveRegs, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, Register SpillReg,
                             int FI) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  // Describe the access as a store to the fixed stack slot, using the
  // object's recorded size and alignment.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  // SpillReg is marked live only for the duration of the call below.
  // NOTE(review): presumably this stops buildSpillLoadStore from choosing
  // SpillReg as a temporary - confirm against SIRegisterInfo.
  LiveRegs.addReg(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, true,
                          FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
                          &LiveRegs);
  LiveRegs.removeReg(SpillReg);
}

// Counterpart of buildPrologSpill: emits, before \p I in \p MBB, a dword load
// of frame index \p FI into \p SpillReg, addressed relative to the stack
// pointer offset register. Unlike the prolog variant, \p LiveRegs is passed
// through without adding or removing \p SpillReg.
static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LivePhysRegs &LiveRegs, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I, Register SpillReg,
                               int FI) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, false,
                          FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
                          &LiveRegs);
}

// Materialize the 64-bit GIT pointer in \p TargetReg, emitting before \p I.
//
// The high half comes from MFI->getGITPtrHigh() unless that value is
// 0xffffffff, in which case S_GETPC_B64 writes the whole register and its
// high half is kept. The low half is then always overwritten with the GIT
// pointer low register, which is also recorded as a live-in of the function
// and of \p MBB.
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    // An explicit high half was provided; move it in as an immediate.
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    // No explicit high half: take it from the program counter.
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
      .addReg(GitPtrLo);
}

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
//
// The initial flat scratch value is obtained in one of two ways:
//  - On AMDPAL it is loaded from the scratch descriptor reached through the
//    GIT pointer, into a free 64-bit SGPR pair chosen here.
//  - Otherwise it is taken from the preloaded FLAT_SCRATCH_INIT register.
// \p ScratchWaveOffsetReg is then added in, and the result is written to the
// flat scratch registers in the form the subtarget generation expects.
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LivePhysRegs LiveRegs;
    LiveRegs.init(*TRI);
    LiveRegs.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    // Skip the 64-bit pairs covered by preloaded SGPRs (count rounded up to
    // whole pairs), clamped to the size of the list.
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    // Take the first pair that is free at block entry, allocatable, and does
    // not contain the GIT pointer low register.
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    // Convert the byte offset into the units the scalar load encoding uses.
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    // The pair holding the GIT pointer is overwritten with the loaded data.
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0xffff);
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    // The preloaded register is read below, so record it as live-in.
    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      // Add the wave offset in place (with carry into the high half), then
      // write each half to the FLAT_SCR_LO/HI hardware registers with
      // S_SETREG_B32. The width-minus-one field is 31, i.e. all 32 bits.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
          .addReg(FlatScrInitLo)
          .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
          .addReg(FlatScrInitHi)
          .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
          addReg(FlatScrInitLo).
          addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                         (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
          addReg(FlatScrInitHi).
          addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                         (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // For GFX9.
    // The add writes directly into the FLAT_SCR_LO/HI registers.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
        .addReg(FlatScrInitHi)
        .addImm(0);

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitLo, RegState::Kill)
      .addImm(8);
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
//
// \returns true when every object index in the frame's index range is marked
// dead. An empty range also yields true.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
//
// \returns a null Register when there is no scratch RSRC register, or when it
// is unused and every stack object is dead. Returns the current register
// unchanged when the subtarget has the SGPR init bug or the register is not
// the default reserved private segment buffer register. Otherwise tries to
// move the reservation to the first suitable unused 128-bit SGPR tuple,
// rewriting existing references and updating the function info; if no tuple
// qualifies, the original register is returned.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  // Nothing to set up: no register, or no use of it and no live stack object.
  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  // Keep the register where it is in these cases.
  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  // Number of 128-bit tuples covered by preloaded SGPRs, rounded up; those are
  // dropped from the front of the candidate list (clamped to its size).
  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
      // Redirect existing references and record the new choice.
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

// Multiplier applied to byte offsets when forming SP/FP values in this file:
// 1 when flat scratch is enabled, otherwise the wavefront size.
static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}

// Emit the prologue of an entry function at the start of \p MBB, which must be
// the first block of \p MF.
//
// In order: selects the scratch RSRC register (only when flat scratch is not
// enabled), relocates the scratch wave offset if it would be clobbered by that
// register, initializes the stack pointer and frame pointer registers when
// they are needed, and finally emits flat scratch init and/or scratch RSRC
// setup. Returns without emitting anything when there is no preloaded scratch
// wave offset register.
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
  // FIXME: Hack to not crash in situations which emitted an error.
  if (!PreloadedScratchWaveOffsetReg)
    return;

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  // The entry block itself is skipped; the register is set up inside it.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found is clobbering with
  // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
  // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
  // wave offset to a free SGPR.
  Register ScratchWaveOffsetReg;
  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    // Candidates start after the preloaded SGPRs (clamped to the list size).
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    // First SGPR that is unused, allocatable, outside the SRSRC tuple, and
    // not the GIT pointer low register. If none qualifies,
    // ScratchWaveOffsetReg stays null and the assert below fires.
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg);

  if (requiresStackPointerReference(MF)) {
    // SP starts at the frame's stack size, scaled by the scratch scale factor.
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST));
  }

  if (hasFP(MF)) {
    // FP starts at zero in an entry function.
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  // Both setup sequences below read the scratch wave offset.
  if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (MFI->hasFlatScratchInit()) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
//
// The descriptor is produced in one of three ways:
//  - AMDPAL: loaded (16 bytes) from the table reached through the GIT pointer.
//  - Mesa graphics shader, or no preloaded RSRC register: words 0-1 come from
//    the implicit buffer pointer (moved or loaded) or from the
//    SCRATCH_RSRC_DWORD0/1 external symbols; words 2-3 come from
//    TII->getScratchRsrcWords23().
//  - AMDHSA/Mesa with a preloaded register: copied from
//    \p PreloadedScratchRsrcReg when it differs from \p ScratchRsrcReg.
// In every case \p ScratchWaveOffsetReg is then added to words 0-1.
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    // Convert the byte offset into the units the scalar load encoding uses.
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    // The load address lives in the low half of the destination tuple.
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
        .addReg(Rsrc01)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
        .addMemOperand(MMO);
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        // Compute: the user SGPR pair is moved into words 0-1 directly.
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        // Non-compute: the user SGPR pair is an address; words 0-1 are
        // loaded (8 bytes) from it.
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addImm(0) // offset
            .addImm(0) // cpol
            .addMemOperand(MMO)
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      // No implicit buffer pointer: words 0 and 1 are taken from external
      // symbols.
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
          .addExternalSymbol("SCRATCH_RSRC_DWORD0")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
          .addExternalSymbol("SCRATCH_RSRC_DWORD1")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    }

    // Words 2 and 3 are the low and high 32 bits of Rsrc23.
    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
        .addImm(Rsrc23 & 0xffffffff)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
        .addImm(Rsrc23 >> 32)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    // Only copy when the selected register differs from the preloaded one.
    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}

// Reports which stack IDs this target accepts: Default, NoAlloc and SGPRSpill
// are supported; ScalableVector is not. Any other value is unreachable.
bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Lazily initialize \p LiveRegs; does nothing if it is already non-empty.
//
// For a prolog the set is seeded with the live-ins of \p MBB. For an epilog it
// is seeded with the live-outs of \p MBB and then stepped backward across the
// instruction at \p MBBI. \p FuncInfo and \p MF are not used by the body.
static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
                         const SIMachineFunctionInfo *FuncInfo,
                         MachineFunction &MF, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveRegs.empty()) {
    LiveRegs.init(TRI);
    if (IsProlog) {
      LiveRegs.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }
}

// Activate all lanes, returns saved exec.
//
// Picks a free non-callee-saved register of the wave mask register class,
// marks it live in \p LiveRegs, and emits S_OR_SAVEEXEC (B32 for wave32, B64
// otherwise) with an all-ones operand before \p MBBI. Aborts with a fatal
// error if no register is available.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     bool IsProlog) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  DebugLoc DL;

  initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  // Reserve the register so later scratch searches do not pick it again.
  LiveRegs.addReg(ScratchExecCopy);

  const unsigned OrSaveExec =
      ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
  BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);

  return ScratchExecCopy;
}

// A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
// Otherwise we are spilling to memory.
701 static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) { 702 const MachineFrameInfo &MFI = MF.getFrameInfo(); 703 return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill; 704 } 705 706 void SIFrameLowering::emitPrologue(MachineFunction &MF, 707 MachineBasicBlock &MBB) const { 708 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 709 if (FuncInfo->isEntryFunction()) { 710 emitEntryFunctionPrologue(MF, MBB); 711 return; 712 } 713 714 const MachineFrameInfo &MFI = MF.getFrameInfo(); 715 MachineRegisterInfo &MRI = MF.getRegInfo(); 716 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 717 const SIInstrInfo *TII = ST.getInstrInfo(); 718 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 719 720 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 721 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 722 Register BasePtrReg = 723 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); 724 LivePhysRegs LiveRegs; 725 726 MachineBasicBlock::iterator MBBI = MBB.begin(); 727 DebugLoc DL; 728 729 bool HasFP = false; 730 bool HasBP = false; 731 uint32_t NumBytes = MFI.getStackSize(); 732 uint32_t RoundedSize = NumBytes; 733 // To avoid clobbering VGPRs in lanes that weren't active on function entry, 734 // turn on all lanes before doing the spill to memory. 
735 Register ScratchExecCopy; 736 737 Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; 738 Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; 739 740 // VGPRs used for SGPR->VGPR spills 741 for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg : 742 FuncInfo->getSGPRSpillVGPRs()) { 743 if (!Reg.FI) 744 continue; 745 746 if (!ScratchExecCopy) 747 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, 748 /*IsProlog*/ true); 749 750 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR, 751 *Reg.FI); 752 } 753 754 // VGPRs used for Whole Wave Mode 755 for (const auto &Reg : FuncInfo->WWMReservedRegs) { 756 auto VGPR = Reg.first; 757 auto FI = Reg.second; 758 if (!FI) 759 continue; 760 761 if (!ScratchExecCopy) 762 ScratchExecCopy = 763 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true); 764 765 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI); 766 } 767 768 if (ScratchExecCopy) { 769 // FIXME: Split block and make terminator. 770 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 771 MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; 772 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 773 .addReg(ScratchExecCopy, RegState::Kill); 774 LiveRegs.addReg(ScratchExecCopy); 775 } 776 777 if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) { 778 const int FramePtrFI = *FPSaveIndex; 779 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 780 781 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); 782 783 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 784 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 785 if (!TmpVGPR) 786 report_fatal_error("failed to find free scratch register"); 787 788 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 789 .addReg(FramePtrReg); 790 791 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 792 FramePtrFI); 793 } 794 795 if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) { 796 const int BasePtrFI = *BPSaveIndex; 797 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 798 799 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); 800 801 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 802 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 803 if (!TmpVGPR) 804 report_fatal_error("failed to find free scratch register"); 805 806 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 807 .addReg(BasePtrReg); 808 809 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 810 BasePtrFI); 811 } 812 813 // In this case, spill the FP to a reserved VGPR. 814 if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) { 815 const int FramePtrFI = *FPSaveIndex; 816 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 817 818 assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); 819 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 820 FuncInfo->getSGPRToVGPRSpills(FramePtrFI); 821 assert(Spill.size() == 1); 822 823 // Save FP before setting it up. 
824 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) 825 .addReg(FramePtrReg) 826 .addImm(Spill[0].Lane) 827 .addReg(Spill[0].VGPR, RegState::Undef); 828 } 829 830 // In this case, spill the BP to a reserved VGPR. 831 if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) { 832 const int BasePtrFI = *BPSaveIndex; 833 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 834 835 assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); 836 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 837 FuncInfo->getSGPRToVGPRSpills(BasePtrFI); 838 assert(Spill.size() == 1); 839 840 // Save BP before setting it up. 841 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) 842 .addReg(BasePtrReg) 843 .addImm(Spill[0].Lane) 844 .addReg(Spill[0].VGPR, RegState::Undef); 845 } 846 847 // Emit the copy if we need an FP, and are using a free SGPR to save it. 848 if (FuncInfo->SGPRForFPSaveRestoreCopy) { 849 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), 850 FuncInfo->SGPRForFPSaveRestoreCopy) 851 .addReg(FramePtrReg) 852 .setMIFlag(MachineInstr::FrameSetup); 853 } 854 855 // Emit the copy if we need a BP, and are using a free SGPR to save it. 856 if (FuncInfo->SGPRForBPSaveRestoreCopy) { 857 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), 858 FuncInfo->SGPRForBPSaveRestoreCopy) 859 .addReg(BasePtrReg) 860 .setMIFlag(MachineInstr::FrameSetup); 861 } 862 863 // If a copy has been emitted for FP and/or BP, Make the SGPRs 864 // used in the copy instructions live throughout the function. 
865 SmallVector<MCPhysReg, 2> TempSGPRs; 866 if (FuncInfo->SGPRForFPSaveRestoreCopy) 867 TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy); 868 869 if (FuncInfo->SGPRForBPSaveRestoreCopy) 870 TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy); 871 872 if (!TempSGPRs.empty()) { 873 for (MachineBasicBlock &MBB : MF) { 874 for (MCPhysReg Reg : TempSGPRs) 875 MBB.addLiveIn(Reg); 876 877 MBB.sortUniqueLiveIns(); 878 } 879 if (!LiveRegs.empty()) { 880 LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); 881 LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy); 882 } 883 } 884 885 if (TRI.hasStackRealignment(MF)) { 886 HasFP = true; 887 const unsigned Alignment = MFI.getMaxAlign().value(); 888 889 RoundedSize += Alignment; 890 if (LiveRegs.empty()) { 891 LiveRegs.init(TRI); 892 LiveRegs.addLiveIns(MBB); 893 } 894 895 // s_add_u32 s33, s32, NumBytes 896 // s_and_b32 s33, s33, 0b111...0000 897 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), FramePtrReg) 898 .addReg(StackPtrReg) 899 .addImm((Alignment - 1) * getScratchScaleFactor(ST)) 900 .setMIFlag(MachineInstr::FrameSetup); 901 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) 902 .addReg(FramePtrReg, RegState::Kill) 903 .addImm(-Alignment * getScratchScaleFactor(ST)) 904 .setMIFlag(MachineInstr::FrameSetup); 905 FuncInfo->setIsStackRealigned(true); 906 } else if ((HasFP = hasFP(MF))) { 907 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 908 .addReg(StackPtrReg) 909 .setMIFlag(MachineInstr::FrameSetup); 910 } 911 912 // If we need a base pointer, set it up here. It's whatever the value of 913 // the stack pointer is at this point. Any variable size objects will be 914 // allocated after this, so we can still use the base pointer to reference 915 // the incoming arguments. 
916 if ((HasBP = TRI.hasBasePointer(MF))) { 917 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 918 .addReg(StackPtrReg) 919 .setMIFlag(MachineInstr::FrameSetup); 920 } 921 922 if (HasFP && RoundedSize != 0) { 923 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) 924 .addReg(StackPtrReg) 925 .addImm(RoundedSize * getScratchScaleFactor(ST)) 926 .setMIFlag(MachineInstr::FrameSetup); 927 } 928 929 assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy || 930 FuncInfo->FramePointerSaveIndex)) && 931 "Needed to save FP but didn't save it anywhere"); 932 933 assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy && 934 !FuncInfo->FramePointerSaveIndex)) && 935 "Saved FP but didn't need it"); 936 937 assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy || 938 FuncInfo->BasePointerSaveIndex)) && 939 "Needed to save BP but didn't save it anywhere"); 940 941 assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy && 942 !FuncInfo->BasePointerSaveIndex)) && 943 "Saved BP but didn't need it"); 944 } 945 946 void SIFrameLowering::emitEpilogue(MachineFunction &MF, 947 MachineBasicBlock &MBB) const { 948 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 949 if (FuncInfo->isEntryFunction()) 950 return; 951 952 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 953 const SIInstrInfo *TII = ST.getInstrInfo(); 954 MachineRegisterInfo &MRI = MF.getRegInfo(); 955 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 956 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); 957 LivePhysRegs LiveRegs; 958 DebugLoc DL; 959 960 const MachineFrameInfo &MFI = MF.getFrameInfo(); 961 uint32_t NumBytes = MFI.getStackSize(); 962 uint32_t RoundedSize = FuncInfo->isStackRealigned() 963 ? NumBytes + MFI.getMaxAlign().value() 964 : NumBytes; 965 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 966 const Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 967 const Register BasePtrReg = 968 TRI.hasBasePointer(MF) ? 
TRI.getBaseRegister() : Register(); 969 970 Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; 971 Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; 972 973 if (RoundedSize != 0 && hasFP(MF)) { 974 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) 975 .addReg(StackPtrReg) 976 .addImm(RoundedSize * getScratchScaleFactor(ST)) 977 .setMIFlag(MachineInstr::FrameDestroy); 978 } 979 980 if (FuncInfo->SGPRForFPSaveRestoreCopy) { 981 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 982 .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) 983 .setMIFlag(MachineInstr::FrameDestroy); 984 } 985 986 if (FuncInfo->SGPRForBPSaveRestoreCopy) { 987 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 988 .addReg(FuncInfo->SGPRForBPSaveRestoreCopy) 989 .setMIFlag(MachineInstr::FrameDestroy); 990 } 991 992 if (FPSaveIndex) { 993 const int FramePtrFI = *FPSaveIndex; 994 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 995 if (spilledToMemory(MF, FramePtrFI)) { 996 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); 997 998 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 999 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 1000 if (!TmpVGPR) 1001 report_fatal_error("failed to find free scratch register"); 1002 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 1003 FramePtrFI); 1004 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) 1005 .addReg(TmpVGPR, RegState::Kill); 1006 } else { 1007 // Reload from VGPR spill. 
1008 assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); 1009 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 1010 FuncInfo->getSGPRToVGPRSpills(FramePtrFI); 1011 assert(Spill.size() == 1); 1012 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg) 1013 .addReg(Spill[0].VGPR) 1014 .addImm(Spill[0].Lane); 1015 } 1016 } 1017 1018 if (BPSaveIndex) { 1019 const int BasePtrFI = *BPSaveIndex; 1020 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 1021 if (spilledToMemory(MF, BasePtrFI)) { 1022 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); 1023 1024 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 1025 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 1026 if (!TmpVGPR) 1027 report_fatal_error("failed to find free scratch register"); 1028 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 1029 BasePtrFI); 1030 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) 1031 .addReg(TmpVGPR, RegState::Kill); 1032 } else { 1033 // Reload from VGPR spill. 
1034 assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); 1035 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 1036 FuncInfo->getSGPRToVGPRSpills(BasePtrFI); 1037 assert(Spill.size() == 1); 1038 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), BasePtrReg) 1039 .addReg(Spill[0].VGPR) 1040 .addImm(Spill[0].Lane); 1041 } 1042 } 1043 1044 Register ScratchExecCopy; 1045 for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg : 1046 FuncInfo->getSGPRSpillVGPRs()) { 1047 if (!Reg.FI) 1048 continue; 1049 1050 if (!ScratchExecCopy) 1051 ScratchExecCopy = 1052 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); 1053 1054 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR, 1055 *Reg.FI); 1056 } 1057 1058 for (const auto &Reg : FuncInfo->WWMReservedRegs) { 1059 auto VGPR = Reg.first; 1060 auto FI = Reg.second; 1061 if (!FI) 1062 continue; 1063 1064 if (!ScratchExecCopy) 1065 ScratchExecCopy = 1066 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); 1067 1068 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI); 1069 } 1070 1071 if (ScratchExecCopy) { 1072 // FIXME: Split block and make terminator. 1073 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1074 MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; 1075 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 1076 .addReg(ScratchExecCopy, RegState::Kill); 1077 } 1078 } 1079 1080 #ifndef NDEBUG 1081 static bool allSGPRSpillsAreDead(const MachineFunction &MF) { 1082 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1083 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1084 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 1085 I != E; ++I) { 1086 if (!MFI.isDeadObjectIndex(I) && 1087 MFI.getStackID(I) == TargetStackID::SGPRSpill && 1088 (I != FuncInfo->FramePointerSaveIndex && 1089 I != FuncInfo->BasePointerSaveIndex)) { 1090 return false; 1091 } 1092 } 1093 1094 return true; 1095 } 1096 #endif 1097 1098 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, 1099 int FI, 1100 Register &FrameReg) const { 1101 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); 1102 1103 FrameReg = RI->getFrameRegister(MF); 1104 return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI)); 1105 } 1106 1107 void SIFrameLowering::processFunctionBeforeFrameFinalized( 1108 MachineFunction &MF, 1109 RegScavenger *RS) const { 1110 MachineFrameInfo &MFI = MF.getFrameInfo(); 1111 1112 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1113 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1114 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1115 1116 FuncInfo->removeDeadFrameIndices(MFI); 1117 assert(allSGPRSpillsAreDead(MF) && 1118 "SGPR spill should have been removed in SILowerSGPRSpills"); 1119 1120 // FIXME: The other checks should be redundant with allStackObjectsAreDead, 1121 // but currently hasNonSpillStackObjects is set only from source 1122 // allocas. Stack temps produced from legalization are not counted currently. 
1123 if (!allStackObjectsAreDead(MFI)) { 1124 assert(RS && "RegScavenger required if spilling"); 1125 1126 // Add an emergency spill slot 1127 RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI)); 1128 } 1129 } 1130 1131 // Only report VGPRs to generic code. 1132 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, 1133 BitVector &SavedVGPRs, 1134 RegScavenger *RS) const { 1135 TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); 1136 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1137 if (MFI->isEntryFunction()) 1138 return; 1139 1140 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 1141 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1142 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1143 1144 // Ignore the SGPRs the default implementation found. 1145 SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask()); 1146 1147 // Do not save AGPRs prior to GFX90A because there was no easy way to do so. 1148 // In gfx908 there was do AGPR loads and stores and thus spilling also 1149 // require a temporary VGPR. 1150 if (!ST.hasGFX90AInsts()) 1151 SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); 1152 1153 // hasFP only knows about stack objects that already exist. We're now 1154 // determining the stack slots that will be created, so we have to predict 1155 // them. Stack objects force FP usage with calls. 1156 // 1157 // Note a new VGPR CSR may be introduced if one is used for the spill, but we 1158 // don't want to report it here. 1159 // 1160 // FIXME: Is this really hasReservedCallFrame? 1161 const bool WillHaveFP = 1162 FrameInfo.hasCalls() && 1163 (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); 1164 1165 // VGPRs used for SGPR spilling need to be specially inserted in the prolog, 1166 // so don't allow the default insertion to handle them. 
1167 for (auto SSpill : MFI->getSGPRSpillVGPRs()) 1168 SavedVGPRs.reset(SSpill.VGPR); 1169 1170 LivePhysRegs LiveRegs; 1171 LiveRegs.init(*TRI); 1172 1173 if (WillHaveFP || hasFP(MF)) { 1174 assert(!MFI->SGPRForFPSaveRestoreCopy && !MFI->FramePointerSaveIndex && 1175 "Re-reserving spill slot for FP"); 1176 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy, 1177 MFI->FramePointerSaveIndex, true); 1178 } 1179 1180 if (TRI->hasBasePointer(MF)) { 1181 if (MFI->SGPRForFPSaveRestoreCopy) 1182 LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy); 1183 1184 assert(!MFI->SGPRForBPSaveRestoreCopy && 1185 !MFI->BasePointerSaveIndex && "Re-reserving spill slot for BP"); 1186 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy, 1187 MFI->BasePointerSaveIndex, false); 1188 } 1189 } 1190 1191 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, 1192 BitVector &SavedRegs, 1193 RegScavenger *RS) const { 1194 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); 1195 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1196 if (MFI->isEntryFunction()) 1197 return; 1198 1199 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1200 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1201 1202 // The SP is specifically managed and we don't want extra spills of it. 1203 SavedRegs.reset(MFI->getStackPtrOffsetReg()); 1204 1205 const BitVector AllSavedRegs = SavedRegs; 1206 SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask()); 1207 1208 // If clearing VGPRs changed the mask, we will have some CSR VGPR spills. 1209 const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs; 1210 1211 // We have to anticipate introducing CSR VGPR spills if we don't have any 1212 // stack objects already, since we require an FP if there is a call and stack. 
1213 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 1214 const bool WillHaveFP = FrameInfo.hasCalls() && HaveAnyCSRVGPR; 1215 1216 // FP will be specially managed like SP. 1217 if (WillHaveFP || hasFP(MF)) 1218 SavedRegs.reset(MFI->getFrameOffsetReg()); 1219 } 1220 1221 bool SIFrameLowering::assignCalleeSavedSpillSlots( 1222 MachineFunction &MF, const TargetRegisterInfo *TRI, 1223 std::vector<CalleeSavedInfo> &CSI) const { 1224 if (CSI.empty()) 1225 return true; // Early exit if no callee saved registers are modified! 1226 1227 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1228 if (!FuncInfo->SGPRForFPSaveRestoreCopy && 1229 !FuncInfo->SGPRForBPSaveRestoreCopy) 1230 return false; 1231 1232 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1233 const SIRegisterInfo *RI = ST.getRegisterInfo(); 1234 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1235 Register BasePtrReg = RI->getBaseRegister(); 1236 unsigned NumModifiedRegs = 0; 1237 1238 if (FuncInfo->SGPRForFPSaveRestoreCopy) 1239 NumModifiedRegs++; 1240 if (FuncInfo->SGPRForBPSaveRestoreCopy) 1241 NumModifiedRegs++; 1242 1243 for (auto &CS : CSI) { 1244 if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) { 1245 CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); 1246 if (--NumModifiedRegs) 1247 break; 1248 } else if (CS.getReg() == BasePtrReg && 1249 FuncInfo->SGPRForBPSaveRestoreCopy) { 1250 CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy); 1251 if (--NumModifiedRegs) 1252 break; 1253 } 1254 } 1255 1256 return false; 1257 } 1258 1259 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( 1260 MachineFunction &MF, 1261 MachineBasicBlock &MBB, 1262 MachineBasicBlock::iterator I) const { 1263 int64_t Amount = I->getOperand(0).getImm(); 1264 if (Amount == 0) 1265 return MBB.erase(I); 1266 1267 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1268 const SIInstrInfo *TII = ST.getInstrInfo(); 1269 const DebugLoc &DL = 
I->getDebugLoc(); 1270 unsigned Opc = I->getOpcode(); 1271 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); 1272 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; 1273 1274 if (!hasReservedCallFrame(MF)) { 1275 Amount = alignTo(Amount, getStackAlign()); 1276 assert(isUInt<32>(Amount) && "exceeded stack address space size"); 1277 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1278 Register SPReg = MFI->getStackPtrOffsetReg(); 1279 1280 unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 1281 BuildMI(MBB, I, DL, TII->get(Op), SPReg) 1282 .addReg(SPReg) 1283 .addImm(Amount * getScratchScaleFactor(ST)); 1284 } else if (CalleePopAmount != 0) { 1285 llvm_unreachable("is this used?"); 1286 } 1287 1288 return MBB.erase(I); 1289 } 1290 1291 /// Returns true if the frame will require a reference to the stack pointer. 1292 /// 1293 /// This is the set of conditions common to setting up the stack pointer in a 1294 /// kernel, and for using a frame pointer in a callable function. 1295 /// 1296 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm 1297 /// references SP. 1298 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) { 1299 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint(); 1300 } 1301 1302 // The FP for kernels is always known 0, so we never really need to setup an 1303 // explicit register for it. However, DisableFramePointerElim will force us to 1304 // use a register for it. 1305 bool SIFrameLowering::hasFP(const MachineFunction &MF) const { 1306 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1307 1308 // For entry functions we can use an immediate offset in most cases, so the 1309 // presence of calls doesn't imply we need a distinct frame pointer. 
1310 if (MFI.hasCalls() && 1311 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { 1312 // All offsets are unsigned, so need to be addressed in the same direction 1313 // as stack growth. 1314 1315 // FIXME: This function is pretty broken, since it can be called before the 1316 // frame layout is determined or CSR spills are inserted. 1317 return MFI.getStackSize() != 0; 1318 } 1319 1320 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || 1321 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( 1322 MF) || 1323 MF.getTarget().Options.DisableFramePointerElim(MF); 1324 } 1325 1326 // This is essentially a reduced version of hasFP for entry functions. Since the 1327 // stack pointer is known 0 on entry to kernels, we never really need an FP 1328 // register. We may need to initialize the stack pointer depending on the frame 1329 // properties, which logically overlaps many of the cases where an ordinary 1330 // function would require an FP. 1331 bool SIFrameLowering::requiresStackPointerReference( 1332 const MachineFunction &MF) const { 1333 // Callable functions always require a stack pointer reference. 1334 assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && 1335 "only expected to call this for entry points"); 1336 1337 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1338 1339 // Entry points ordinarily don't need to initialize SP. We have to set it up 1340 // for callees if there are any. Also note tail calls are impossible/don't 1341 // make any sense for kernels. 1342 if (MFI.hasCalls()) 1343 return true; 1344 1345 // We still need to initialize the SP if we're doing anything weird that 1346 // references the SP, like variable sized stack objects. 1347 return frameTriviallyRequiresSP(MFI); 1348 } 1349