//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
//
// \param MRI      Register info queried for the callee-saved list, physical
//                 register usage and availability.
// \param LiveRegs Liveness set consulted for availability. NOTE: this set is
//                 modified -- every callee-saved register is added to it and
//                 is not removed again before returning.
// \param RC       Register class whose members are tried in iteration order.
// \param Unused   When true, a candidate must additionally have no use
//                 anywhere in the function (!MRI.isPhysRegUsed).
// \returns The first acceptable register of \p RC, or a default-constructed
//          MCRegister if none qualifies.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  // The callee-saved list is terminated by a zero entry.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  if (Unused) {
    // We are looking for a register that can be used throughout the entire
    // function, so any use is unacceptable.
    for (MCRegister Reg : RC) {
      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
        return Reg;
    }
  } else {
    for (MCRegister Reg : RC) {
      if (LiveRegs.available(MRI, Reg))
        return Reg;
    }
  }

  return MCRegister();
}

// Decide where the incoming FP or BP value will be saved, trying the options
// numbered 1-4 in the comments below in order.
//
// The result is reported through exactly one of the two out-parameters:
//   - \p TempSGPR is set when a free SGPR was found (option 2); in that case
//     \p FrameIndex is left untouched.
//   - \p FrameIndex is set when a frame object was created, either an
//     SGPRSpill object backed by a VGPR lane (options 1 and 3) or an ordinary
//     spill slot (option 4).
// On the option-1 path the function returns before \p TempSGPR is assigned.
//
// \param LiveRegs Passed to findScratchNonCalleeSaveRegister, which adds the
//                 callee-saved registers to it.
// \param IsFP     Only selects "FP" vs. "BP" in the debug output.
static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
                                           LivePhysRegs &LiveRegs,
                                           Register &TempSGPR,
                                           Optional<int> &FrameIndex,
                                           bool IsFP) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // We need to save and restore the current FP/BP.

  // 1: If there is already a VGPR with free lanes, use it. We
  // may already have to pay the penalty for spilling a CSR VGPR.
  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    // A free lane was just reported, so the allocation is expected to succeed.
    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");

    FrameIndex = NewFI;

    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
               dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                      << '\n');
    return;
  }

  // 2: Next, try to save the FP/BP in an unused SGPR.
  // Unused=true: the register must not be used anywhere in the function.
  TempSGPR = findScratchNonCalleeSaveRegister(
      MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);

  if (!TempSGPR) {
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP/BP,
      // so we're forced to spill another VGPR to use for the spill.
      FrameIndex = NewFI;

      LLVM_DEBUG(
          auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
          dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
                 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
    } else {
      // Remove dead <NewFI> index
      MF.getFrameInfo().RemoveStackObject(NewFI);
      // 4: If all else fails, spill the FP/BP to memory.
      FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling "
                        << (IsFP ? "FP" : "BP") << '\n');
    }
  } else {
    LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
                      << printReg(TempSGPR, TRI) << '\n');
  }
}

// We need to specially emit stack operations here because a different frame
// register is used than in the rest of the function, as getFrameRegister would
// use.
//
// Emits a dword store of \p SpillReg to frame index \p FI before \p I,
// addressed relative to the stack pointer offset register. The opcode is the
// scratch form when flat scratch is enabled, the buffer form otherwise. The
// memory operand takes its size and alignment from the frame object.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LivePhysRegs &LiveRegs, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, Register SpillReg,
                             int FI) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  // SpillReg is marked live only for the duration of the call below.
  // NOTE(review): presumably so buildSpillLoadStore does not pick the value
  // being stored as a temporary -- confirm against SIRegisterInfo.
  LiveRegs.addReg(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, true,
                          FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
                          &LiveRegs);
  LiveRegs.removeReg(SpillReg);
}

// Load counterpart of buildPrologSpill: emits a dword load from frame index
// \p FI into \p SpillReg before \p I, addressed relative to the stack pointer
// offset register. Unlike the store path, \p SpillReg is not added to
// \p LiveRegs around the call.
static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LivePhysRegs &LiveRegs, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I, Register SpillReg,
                               int FI) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, false,
                          FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
                          &LiveRegs);
}

// Materialize the 64-bit GIT pointer in \p TargetReg, inserting before \p I.
//
// The high half comes from MFI->getGITPtrHigh() unless that value is
// 0xffffffff, in which case S_GETPC_B64 writes the whole of \p TargetReg
// first. The low half is then overwritten with a copy of the GIT pointer low
// register, which is registered as a live-in of both the function and \p MBB.
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  // 0xffffffff is treated as "no explicit high half was provided".
  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
    .addReg(GitPtrLo);
}

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
//
// All instructions are inserted before \p I in \p MBB using \p DL.
// \p ScratchWaveOffsetReg is added to the initial flat scratch value.
//
// The initial value is obtained in one of two ways:
//   - AMDPAL: loaded as two dwords through the GIT pointer into a free 64-bit
//     SGPR pair, with the high dword then masked to its low 16 bits.
//   - otherwise: taken from the preloaded FLAT_SCRATCH_INIT argument register.
//
// It is then installed in one of three ways:
//   - flat scratch is a pointer, GFX10 or newer: 64-bit add in place, then two
//     S_SETREG_B32 writes to the FLAT_SCR_LO/HI hardware registers.
//   - flat scratch is a pointer, older: 64-bit add directly into
//     FLAT_SCR_LO/FLAT_SCR_HI.
//   - otherwise (asserted to be pre-GFX9): size copied to FLAT_SCR_LO and the
//     offset, shifted right by 8, written to FLAT_SCR_HI.
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LivePhysRegs LiveRegs;
    LiveRegs.init(*TRI);
    LiveRegs.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    // Skip the 64-bit tuples covering the preloaded SGPRs (count rounded up
    // to whole pairs); the min() keeps the slice within bounds.
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      // The pair must not overlap the GIT pointer low register, which
      // buildGitPtr still has to read.
      if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    // The load uses FlatScrInit both as the address and as the destination.
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0xffff);
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      // Add in place, then write both halves out with S_SETREG_B32.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      // The immediate combines the hardware register id with a width-minus-1
      // field of 31.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitLo).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitHi).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
//
// Returns true when every object index in [getObjectIndexBegin(),
// getObjectIndexEnd()) is dead; an empty range therefore yields true.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
// Pick the final scratch resource descriptor (SRSRC) register for an entry
// function.
//
// \returns
//   - Register() when no SRSRC register is set, or when the one that is set is
//     unused and every stack object is dead;
//   - the current register unchanged when the subtarget has the SGPR init bug
//     or the register is not the default reserved private segment buffer;
//   - otherwise the first unused, allocatable 128-bit SGPR tuple past the
//     preloaded SGPRs that does not overlap the GIT pointer low register. In
//     that case all uses are rewritten to it and it is recorded in the
//     function info. If no such tuple exists the current register is kept.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  // Number of 128-bit tuples covering the preloaded SGPRs, rounded up; the
  // min() keeps the slice within bounds.
  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

// Multiplier applied to stack sizes and alignments before they are used as
// immediates on the stack pointer / frame pointer registers: 1 when flat
// scratch is enabled, the wavefront size otherwise.
static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}

// Emit the prologue of an entry function at the start of \p MBB, which must be
// the function's first block.
//
// Steps, in order:
//   1. Return without emitting anything if there is no preloaded scratch wave
//      offset register.
//   2. Unless flat scratch is enabled, settle on the SRSRC register and add it
//      as a live-in to every other block.
//   3. If the SRSRC overlaps the preloaded scratch wave offset, copy the
//      offset to a free SGPR.
//   4. Initialize SP (to the scaled stack size) and FP (to 0) when required.
//   5. Emit flat scratch init and SRSRC setup when applicable.
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
  // FIXME: Hack to not crash in situations which emitted an error.
  if (!PreloadedScratchWaveOffsetReg)
    return;

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      // MBB itself is skipped: the register is set up in this block.
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found is clobbering with
  // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
  // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
  // wave offset to a free SGPR.
  Register ScratchWaveOffsetReg;
  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      // The replacement must be unused, allocatable, outside the SRSRC tuple
      // and distinct from the GIT pointer low register.
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  // If the search above found nothing, ScratchWaveOffsetReg is still null
  // here; that is only diagnosed in builds with assertions enabled.
  assert(ScratchWaveOffsetReg);

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST));
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  // Both setup paths below read the preloaded wave offset, so it is marked
  // live-in unless flat scratch is architected.
  if ((MFI->hasFlatScratchInit() || ScratchRsrcReg) &&
      !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (MFI->hasFlatScratchInit()) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
//
// All instructions are inserted before \p I. The descriptor in
// \p ScratchRsrcReg is populated in one of three ways:
//   - AMDPAL: loaded as four dwords through the GIT pointer.
//   - Mesa graphics shader, or no preloaded SRSRC: dwords 0-1 come from the
//     implicit buffer pointer or from the SCRATCH_RSRC_DWORD0/1 external
//     symbols, dwords 2-3 from TII->getScratchRsrcWords23().
//   - HSA/Mesa with a preloaded SRSRC: copied from
//     \p PreloadedScratchRsrcReg when the registers differ.
// Finally \p ScratchWaveOffsetReg is added to the low two dwords.
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    // The address (Rsrc01) is the low half of the destination tuple.
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // cpol
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of
    // descriptor / bits 22:21 of third sub-reg will be 0b11)
    // If the shader is actually wave32 we have to modify the const_index_stride
    // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
    // reason the driver does this is that there can be cases where it presents
    // 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      // Clearing bit 21 turns 0b11 into 0b10.
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        // Compute: the user SGPR value is moved into dwords 0-1 directly.
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        // Otherwise the user SGPR is used as an address and dwords 0-1 are
        // loaded through it.
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // cpol
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    }

    // Dwords 2 and 3 are the low and high halves of Rsrc23.
    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
    .addReg(ScratchRsrcSub0)
    .addReg(ScratchWaveOffsetReg)
    .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
    .addReg(ScratchRsrcSub1)
    .addImm(0)
    .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}

// Report which stack IDs this target accepts: Default, NoAlloc and SGPRSpill
// are supported; ScalableVector and WasmLocal are not. The switch has no
// default case, so any other value reaches the llvm_unreachable.
bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Initialize \p LiveRegs if it is still empty; a non-empty set is left as is.
//
// For a prolog the set is seeded with the live-ins of \p MBB. For an epilog it
// is seeded with the live-outs of \p MBB and then stepped backward across the
// instruction at \p MBBI. \p FuncInfo and \p MF are not used by the body.
static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
                         const SIMachineFunctionInfo *FuncInfo,
                         MachineFunction &MF, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveRegs.empty()) {
    LiveRegs.init(TRI);
    if (IsProlog) {
      LiveRegs.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }
}

// Activate all lanes, returns saved exec.
//
// Inserts S_OR_SAVEEXEC_B32 (wave32) or S_OR_SAVEEXEC_B64 with an immediate of
// -1 before \p MBBI, writing to a scratch register of the wave mask class that
// is not callee-saved. That register is added to \p LiveRegs. Calls
// report_fatal_error if no such register is free.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     bool IsProlog) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  DebugLoc DL;

  // No-op if the caller has already populated LiveRegs.
  initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  // Keep later scratch-register searches from handing out the saved exec.
  LiveRegs.addReg(ScratchExecCopy);

  const unsigned OrSaveExec =
      ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
  BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);

  return ScratchExecCopy;
}

// A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
// Otherwise we are spilling to memory.
718 static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) { 719 const MachineFrameInfo &MFI = MF.getFrameInfo(); 720 return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill; 721 } 722 723 void SIFrameLowering::emitPrologue(MachineFunction &MF, 724 MachineBasicBlock &MBB) const { 725 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 726 if (FuncInfo->isEntryFunction()) { 727 emitEntryFunctionPrologue(MF, MBB); 728 return; 729 } 730 731 const MachineFrameInfo &MFI = MF.getFrameInfo(); 732 MachineRegisterInfo &MRI = MF.getRegInfo(); 733 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 734 const SIInstrInfo *TII = ST.getInstrInfo(); 735 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 736 737 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 738 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 739 Register BasePtrReg = 740 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); 741 LivePhysRegs LiveRegs; 742 743 MachineBasicBlock::iterator MBBI = MBB.begin(); 744 DebugLoc DL; 745 746 bool HasFP = false; 747 bool HasBP = false; 748 uint32_t NumBytes = MFI.getStackSize(); 749 uint32_t RoundedSize = NumBytes; 750 // To avoid clobbering VGPRs in lanes that weren't active on function entry, 751 // turn on all lanes before doing the spill to memory. 
752 Register ScratchExecCopy; 753 754 Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; 755 Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; 756 757 // VGPRs used for SGPR->VGPR spills 758 for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg : 759 FuncInfo->getSGPRSpillVGPRs()) { 760 if (!Reg.FI) 761 continue; 762 763 if (!ScratchExecCopy) 764 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, 765 /*IsProlog*/ true); 766 767 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR, 768 *Reg.FI); 769 } 770 771 // VGPRs used for Whole Wave Mode 772 for (const auto &Reg : FuncInfo->WWMReservedRegs) { 773 auto VGPR = Reg.first; 774 auto FI = Reg.second; 775 if (!FI) 776 continue; 777 778 if (!ScratchExecCopy) 779 ScratchExecCopy = 780 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true); 781 782 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI); 783 } 784 785 if (ScratchExecCopy) { 786 // FIXME: Split block and make terminator. 787 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 788 MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; 789 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 790 .addReg(ScratchExecCopy, RegState::Kill); 791 LiveRegs.addReg(ScratchExecCopy); 792 } 793 794 if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) { 795 const int FramePtrFI = *FPSaveIndex; 796 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 797 798 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); 799 800 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 801 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 802 if (!TmpVGPR) 803 report_fatal_error("failed to find free scratch register"); 804 805 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 806 .addReg(FramePtrReg); 807 808 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 809 FramePtrFI); 810 } 811 812 if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) { 813 const int BasePtrFI = *BPSaveIndex; 814 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 815 816 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); 817 818 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 819 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 820 if (!TmpVGPR) 821 report_fatal_error("failed to find free scratch register"); 822 823 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 824 .addReg(BasePtrReg); 825 826 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 827 BasePtrFI); 828 } 829 830 // In this case, spill the FP to a reserved VGPR. 831 if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) { 832 const int FramePtrFI = *FPSaveIndex; 833 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 834 835 assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); 836 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 837 FuncInfo->getSGPRToVGPRSpills(FramePtrFI); 838 assert(Spill.size() == 1); 839 840 // Save FP before setting it up. 
841 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) 842 .addReg(FramePtrReg) 843 .addImm(Spill[0].Lane) 844 .addReg(Spill[0].VGPR, RegState::Undef); 845 } 846 847 // In this case, spill the BP to a reserved VGPR. 848 if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) { 849 const int BasePtrFI = *BPSaveIndex; 850 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 851 852 assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); 853 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 854 FuncInfo->getSGPRToVGPRSpills(BasePtrFI); 855 assert(Spill.size() == 1); 856 857 // Save BP before setting it up. 858 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) 859 .addReg(BasePtrReg) 860 .addImm(Spill[0].Lane) 861 .addReg(Spill[0].VGPR, RegState::Undef); 862 } 863 864 // Emit the copy if we need an FP, and are using a free SGPR to save it. 865 if (FuncInfo->SGPRForFPSaveRestoreCopy) { 866 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), 867 FuncInfo->SGPRForFPSaveRestoreCopy) 868 .addReg(FramePtrReg) 869 .setMIFlag(MachineInstr::FrameSetup); 870 } 871 872 // Emit the copy if we need a BP, and are using a free SGPR to save it. 873 if (FuncInfo->SGPRForBPSaveRestoreCopy) { 874 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), 875 FuncInfo->SGPRForBPSaveRestoreCopy) 876 .addReg(BasePtrReg) 877 .setMIFlag(MachineInstr::FrameSetup); 878 } 879 880 // If a copy has been emitted for FP and/or BP, Make the SGPRs 881 // used in the copy instructions live throughout the function. 
882 SmallVector<MCPhysReg, 2> TempSGPRs; 883 if (FuncInfo->SGPRForFPSaveRestoreCopy) 884 TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy); 885 886 if (FuncInfo->SGPRForBPSaveRestoreCopy) 887 TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy); 888 889 if (!TempSGPRs.empty()) { 890 for (MachineBasicBlock &MBB : MF) { 891 for (MCPhysReg Reg : TempSGPRs) 892 MBB.addLiveIn(Reg); 893 894 MBB.sortUniqueLiveIns(); 895 } 896 if (!LiveRegs.empty()) { 897 LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); 898 LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy); 899 } 900 } 901 902 if (TRI.hasStackRealignment(MF)) { 903 HasFP = true; 904 const unsigned Alignment = MFI.getMaxAlign().value(); 905 906 RoundedSize += Alignment; 907 if (LiveRegs.empty()) { 908 LiveRegs.init(TRI); 909 LiveRegs.addLiveIns(MBB); 910 } 911 912 // s_add_i32 s33, s32, NumBytes 913 // s_and_b32 s33, s33, 0b111...0000 914 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg) 915 .addReg(StackPtrReg) 916 .addImm((Alignment - 1) * getScratchScaleFactor(ST)) 917 .setMIFlag(MachineInstr::FrameSetup); 918 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) 919 .addReg(FramePtrReg, RegState::Kill) 920 .addImm(-Alignment * getScratchScaleFactor(ST)) 921 .setMIFlag(MachineInstr::FrameSetup); 922 FuncInfo->setIsStackRealigned(true); 923 } else if ((HasFP = hasFP(MF))) { 924 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 925 .addReg(StackPtrReg) 926 .setMIFlag(MachineInstr::FrameSetup); 927 } 928 929 // If we need a base pointer, set it up here. It's whatever the value of 930 // the stack pointer is at this point. Any variable size objects will be 931 // allocated after this, so we can still use the base pointer to reference 932 // the incoming arguments. 
933 if ((HasBP = TRI.hasBasePointer(MF))) { 934 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 935 .addReg(StackPtrReg) 936 .setMIFlag(MachineInstr::FrameSetup); 937 } 938 939 if (HasFP && RoundedSize != 0) { 940 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) 941 .addReg(StackPtrReg) 942 .addImm(RoundedSize * getScratchScaleFactor(ST)) 943 .setMIFlag(MachineInstr::FrameSetup); 944 } 945 946 assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy || 947 FuncInfo->FramePointerSaveIndex)) && 948 "Needed to save FP but didn't save it anywhere"); 949 950 assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy && 951 !FuncInfo->FramePointerSaveIndex)) && 952 "Saved FP but didn't need it"); 953 954 assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy || 955 FuncInfo->BasePointerSaveIndex)) && 956 "Needed to save BP but didn't save it anywhere"); 957 958 assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy && 959 !FuncInfo->BasePointerSaveIndex)) && 960 "Saved BP but didn't need it"); 961 } 962 963 void SIFrameLowering::emitEpilogue(MachineFunction &MF, 964 MachineBasicBlock &MBB) const { 965 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 966 if (FuncInfo->isEntryFunction()) 967 return; 968 969 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 970 const SIInstrInfo *TII = ST.getInstrInfo(); 971 MachineRegisterInfo &MRI = MF.getRegInfo(); 972 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 973 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); 974 LivePhysRegs LiveRegs; 975 DebugLoc DL; 976 977 const MachineFrameInfo &MFI = MF.getFrameInfo(); 978 uint32_t NumBytes = MFI.getStackSize(); 979 uint32_t RoundedSize = FuncInfo->isStackRealigned() 980 ? NumBytes + MFI.getMaxAlign().value() 981 : NumBytes; 982 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 983 const Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 984 const Register BasePtrReg = 985 TRI.hasBasePointer(MF) ? 
TRI.getBaseRegister() : Register(); 986 987 Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; 988 Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; 989 990 if (RoundedSize != 0 && hasFP(MF)) { 991 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) 992 .addReg(StackPtrReg) 993 .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST))) 994 .setMIFlag(MachineInstr::FrameDestroy); 995 } 996 997 if (FuncInfo->SGPRForFPSaveRestoreCopy) { 998 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 999 .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) 1000 .setMIFlag(MachineInstr::FrameDestroy); 1001 } 1002 1003 if (FuncInfo->SGPRForBPSaveRestoreCopy) { 1004 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 1005 .addReg(FuncInfo->SGPRForBPSaveRestoreCopy) 1006 .setMIFlag(MachineInstr::FrameDestroy); 1007 } 1008 1009 if (FPSaveIndex) { 1010 const int FramePtrFI = *FPSaveIndex; 1011 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 1012 if (spilledToMemory(MF, FramePtrFI)) { 1013 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); 1014 1015 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 1016 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 1017 if (!TmpVGPR) 1018 report_fatal_error("failed to find free scratch register"); 1019 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 1020 FramePtrFI); 1021 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) 1022 .addReg(TmpVGPR, RegState::Kill); 1023 } else { 1024 // Reload from VGPR spill. 
1025 assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); 1026 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 1027 FuncInfo->getSGPRToVGPRSpills(FramePtrFI); 1028 assert(Spill.size() == 1); 1029 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg) 1030 .addReg(Spill[0].VGPR) 1031 .addImm(Spill[0].Lane); 1032 } 1033 } 1034 1035 if (BPSaveIndex) { 1036 const int BasePtrFI = *BPSaveIndex; 1037 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 1038 if (spilledToMemory(MF, BasePtrFI)) { 1039 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); 1040 1041 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 1042 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 1043 if (!TmpVGPR) 1044 report_fatal_error("failed to find free scratch register"); 1045 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 1046 BasePtrFI); 1047 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) 1048 .addReg(TmpVGPR, RegState::Kill); 1049 } else { 1050 // Reload from VGPR spill. 
1051 assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); 1052 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 1053 FuncInfo->getSGPRToVGPRSpills(BasePtrFI); 1054 assert(Spill.size() == 1); 1055 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), BasePtrReg) 1056 .addReg(Spill[0].VGPR) 1057 .addImm(Spill[0].Lane); 1058 } 1059 } 1060 1061 Register ScratchExecCopy; 1062 for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg : 1063 FuncInfo->getSGPRSpillVGPRs()) { 1064 if (!Reg.FI) 1065 continue; 1066 1067 if (!ScratchExecCopy) 1068 ScratchExecCopy = 1069 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); 1070 1071 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR, 1072 *Reg.FI); 1073 } 1074 1075 for (const auto &Reg : FuncInfo->WWMReservedRegs) { 1076 auto VGPR = Reg.first; 1077 auto FI = Reg.second; 1078 if (!FI) 1079 continue; 1080 1081 if (!ScratchExecCopy) 1082 ScratchExecCopy = 1083 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); 1084 1085 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI); 1086 } 1087 1088 if (ScratchExecCopy) { 1089 // FIXME: Split block and make terminator. 1090 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1091 MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; 1092 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 1093 .addReg(ScratchExecCopy, RegState::Kill); 1094 } 1095 } 1096 1097 #ifndef NDEBUG 1098 static bool allSGPRSpillsAreDead(const MachineFunction &MF) { 1099 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1100 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1101 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 1102 I != E; ++I) { 1103 if (!MFI.isDeadObjectIndex(I) && 1104 MFI.getStackID(I) == TargetStackID::SGPRSpill && 1105 (I != FuncInfo->FramePointerSaveIndex && 1106 I != FuncInfo->BasePointerSaveIndex)) { 1107 return false; 1108 } 1109 } 1110 1111 return true; 1112 } 1113 #endif 1114 1115 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, 1116 int FI, 1117 Register &FrameReg) const { 1118 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); 1119 1120 FrameReg = RI->getFrameRegister(MF); 1121 return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI)); 1122 } 1123 1124 void SIFrameLowering::processFunctionBeforeFrameFinalized( 1125 MachineFunction &MF, 1126 RegScavenger *RS) const { 1127 MachineFrameInfo &MFI = MF.getFrameInfo(); 1128 1129 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1130 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1131 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1132 1133 FuncInfo->removeDeadFrameIndices(MFI); 1134 assert(allSGPRSpillsAreDead(MF) && 1135 "SGPR spill should have been removed in SILowerSGPRSpills"); 1136 1137 // FIXME: The other checks should be redundant with allStackObjectsAreDead, 1138 // but currently hasNonSpillStackObjects is set only from source 1139 // allocas. Stack temps produced from legalization are not counted currently. 
1140 if (!allStackObjectsAreDead(MFI)) { 1141 assert(RS && "RegScavenger required if spilling"); 1142 1143 // Add an emergency spill slot 1144 RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI)); 1145 } 1146 } 1147 1148 // Only report VGPRs to generic code. 1149 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, 1150 BitVector &SavedVGPRs, 1151 RegScavenger *RS) const { 1152 TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); 1153 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1154 if (MFI->isEntryFunction()) 1155 return; 1156 1157 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 1158 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1159 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1160 1161 // Ignore the SGPRs the default implementation found. 1162 SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask()); 1163 1164 // Do not save AGPRs prior to GFX90A because there was no easy way to do so. 1165 // In gfx908 there was do AGPR loads and stores and thus spilling also 1166 // require a temporary VGPR. 1167 if (!ST.hasGFX90AInsts()) 1168 SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); 1169 1170 // hasFP only knows about stack objects that already exist. We're now 1171 // determining the stack slots that will be created, so we have to predict 1172 // them. Stack objects force FP usage with calls. 1173 // 1174 // Note a new VGPR CSR may be introduced if one is used for the spill, but we 1175 // don't want to report it here. 1176 // 1177 // FIXME: Is this really hasReservedCallFrame? 1178 const bool WillHaveFP = 1179 FrameInfo.hasCalls() && 1180 (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); 1181 1182 // VGPRs used for SGPR spilling need to be specially inserted in the prolog, 1183 // so don't allow the default insertion to handle them. 
1184 for (auto SSpill : MFI->getSGPRSpillVGPRs()) 1185 SavedVGPRs.reset(SSpill.VGPR); 1186 1187 LivePhysRegs LiveRegs; 1188 LiveRegs.init(*TRI); 1189 1190 if (WillHaveFP || hasFP(MF)) { 1191 assert(!MFI->SGPRForFPSaveRestoreCopy && !MFI->FramePointerSaveIndex && 1192 "Re-reserving spill slot for FP"); 1193 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy, 1194 MFI->FramePointerSaveIndex, true); 1195 } 1196 1197 if (TRI->hasBasePointer(MF)) { 1198 if (MFI->SGPRForFPSaveRestoreCopy) 1199 LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy); 1200 1201 assert(!MFI->SGPRForBPSaveRestoreCopy && 1202 !MFI->BasePointerSaveIndex && "Re-reserving spill slot for BP"); 1203 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy, 1204 MFI->BasePointerSaveIndex, false); 1205 } 1206 } 1207 1208 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, 1209 BitVector &SavedRegs, 1210 RegScavenger *RS) const { 1211 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); 1212 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1213 if (MFI->isEntryFunction()) 1214 return; 1215 1216 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1217 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1218 1219 // The SP is specifically managed and we don't want extra spills of it. 1220 SavedRegs.reset(MFI->getStackPtrOffsetReg()); 1221 1222 const BitVector AllSavedRegs = SavedRegs; 1223 SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask()); 1224 1225 // If clearing VGPRs changed the mask, we will have some CSR VGPR spills. 1226 const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs; 1227 1228 // We have to anticipate introducing CSR VGPR spills if we don't have any 1229 // stack objects already, since we require an FP if there is a call and stack. 
1230 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 1231 const bool WillHaveFP = FrameInfo.hasCalls() && HaveAnyCSRVGPR; 1232 1233 // FP will be specially managed like SP. 1234 if (WillHaveFP || hasFP(MF)) 1235 SavedRegs.reset(MFI->getFrameOffsetReg()); 1236 } 1237 1238 bool SIFrameLowering::assignCalleeSavedSpillSlots( 1239 MachineFunction &MF, const TargetRegisterInfo *TRI, 1240 std::vector<CalleeSavedInfo> &CSI) const { 1241 if (CSI.empty()) 1242 return true; // Early exit if no callee saved registers are modified! 1243 1244 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1245 if (!FuncInfo->SGPRForFPSaveRestoreCopy && 1246 !FuncInfo->SGPRForBPSaveRestoreCopy) 1247 return false; 1248 1249 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1250 const SIRegisterInfo *RI = ST.getRegisterInfo(); 1251 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1252 Register BasePtrReg = RI->getBaseRegister(); 1253 unsigned NumModifiedRegs = 0; 1254 1255 if (FuncInfo->SGPRForFPSaveRestoreCopy) 1256 NumModifiedRegs++; 1257 if (FuncInfo->SGPRForBPSaveRestoreCopy) 1258 NumModifiedRegs++; 1259 1260 for (auto &CS : CSI) { 1261 if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) { 1262 CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); 1263 if (--NumModifiedRegs) 1264 break; 1265 } else if (CS.getReg() == BasePtrReg && 1266 FuncInfo->SGPRForBPSaveRestoreCopy) { 1267 CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy); 1268 if (--NumModifiedRegs) 1269 break; 1270 } 1271 } 1272 1273 return false; 1274 } 1275 1276 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( 1277 MachineFunction &MF, 1278 MachineBasicBlock &MBB, 1279 MachineBasicBlock::iterator I) const { 1280 int64_t Amount = I->getOperand(0).getImm(); 1281 if (Amount == 0) 1282 return MBB.erase(I); 1283 1284 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1285 const SIInstrInfo *TII = ST.getInstrInfo(); 1286 const DebugLoc &DL = 
I->getDebugLoc(); 1287 unsigned Opc = I->getOpcode(); 1288 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); 1289 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; 1290 1291 if (!hasReservedCallFrame(MF)) { 1292 Amount = alignTo(Amount, getStackAlign()); 1293 assert(isUInt<32>(Amount) && "exceeded stack address space size"); 1294 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1295 Register SPReg = MFI->getStackPtrOffsetReg(); 1296 1297 Amount *= getScratchScaleFactor(ST); 1298 if (IsDestroy) 1299 Amount = -Amount; 1300 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg) 1301 .addReg(SPReg) 1302 .addImm(Amount); 1303 } else if (CalleePopAmount != 0) { 1304 llvm_unreachable("is this used?"); 1305 } 1306 1307 return MBB.erase(I); 1308 } 1309 1310 /// Returns true if the frame will require a reference to the stack pointer. 1311 /// 1312 /// This is the set of conditions common to setting up the stack pointer in a 1313 /// kernel, and for using a frame pointer in a callable function. 1314 /// 1315 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm 1316 /// references SP. 1317 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) { 1318 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint(); 1319 } 1320 1321 // The FP for kernels is always known 0, so we never really need to setup an 1322 // explicit register for it. However, DisableFramePointerElim will force us to 1323 // use a register for it. 1324 bool SIFrameLowering::hasFP(const MachineFunction &MF) const { 1325 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1326 1327 // For entry functions we can use an immediate offset in most cases, so the 1328 // presence of calls doesn't imply we need a distinct frame pointer. 
1329 if (MFI.hasCalls() && 1330 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { 1331 // All offsets are unsigned, so need to be addressed in the same direction 1332 // as stack growth. 1333 1334 // FIXME: This function is pretty broken, since it can be called before the 1335 // frame layout is determined or CSR spills are inserted. 1336 return MFI.getStackSize() != 0; 1337 } 1338 1339 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || 1340 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( 1341 MF) || 1342 MF.getTarget().Options.DisableFramePointerElim(MF); 1343 } 1344 1345 // This is essentially a reduced version of hasFP for entry functions. Since the 1346 // stack pointer is known 0 on entry to kernels, we never really need an FP 1347 // register. We may need to initialize the stack pointer depending on the frame 1348 // properties, which logically overlaps many of the cases where an ordinary 1349 // function would require an FP. 1350 bool SIFrameLowering::requiresStackPointerReference( 1351 const MachineFunction &MF) const { 1352 // Callable functions always require a stack pointer reference. 1353 assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && 1354 "only expected to call this for entry points"); 1355 1356 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1357 1358 // Entry points ordinarily don't need to initialize SP. We have to set it up 1359 // for callees if there are any. Also note tail calls are impossible/don't 1360 // make any sense for kernels. 1361 if (MFI.hasCalls()) 1362 return true; 1363 1364 // We still need to initialize the SP if we're doing anything weird that 1365 // references the SP, like variable sized stack objects. 1366 return frameTriviallyRequiresSP(MFI); 1367 } 1368