1 //===----------------------- SIFrameLowering.cpp --------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 9 #include "SIFrameLowering.h" 10 #include "AMDGPU.h" 11 #include "GCNSubtarget.h" 12 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 13 #include "SIMachineFunctionInfo.h" 14 #include "llvm/CodeGen/LivePhysRegs.h" 15 #include "llvm/CodeGen/MachineFrameInfo.h" 16 #include "llvm/CodeGen/RegisterScavenging.h" 17 #include "llvm/Target/TargetMachine.h" 18 19 using namespace llvm; 20 21 #define DEBUG_TYPE "frame-info" 22 23 // Find a scratch register that we can use in the prologue. We avoid using 24 // callee-save registers since they may appear to be free when this is called 25 // from canUseAsPrologue (during shrink wrapping), but then no longer be free 26 // when this is called from emitPrologue. 27 static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, 28 LivePhysRegs &LiveRegs, 29 const TargetRegisterClass &RC, 30 bool Unused = false) { 31 // Mark callee saved registers as used so we will not choose them. 32 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); 33 for (unsigned i = 0; CSRegs[i]; ++i) 34 LiveRegs.addReg(CSRegs[i]); 35 36 if (Unused) { 37 // We are looking for a register that can be used throughout the entire 38 // function, so any use is unacceptable. 39 for (MCRegister Reg : RC) { 40 if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg)) 41 return Reg; 42 } 43 } else { 44 for (MCRegister Reg : RC) { 45 if (LiveRegs.available(MRI, Reg)) 46 return Reg; 47 } 48 } 49 50 return MCRegister(); 51 } 52 53 static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF, 54 LivePhysRegs &LiveRegs, 55 Register &TempSGPR, 56 Optional<int> &FrameIndex, 57 bool IsFP) { 58 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 59 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 60 61 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 62 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 63 64 // We need to save and restore the current FP/BP. 65 66 // 1: If there is already a VGPR with free lanes, use it. We 67 // may already have to pay the penalty for spilling a CSR VGPR. 68 if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) { 69 int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, 70 TargetStackID::SGPRSpill); 71 72 if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) 73 llvm_unreachable("allocate SGPR spill should have worked"); 74 75 FrameIndex = NewFI; 76 77 LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); 78 dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to " 79 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane 80 << '\n'); 81 return; 82 } 83 84 // 2: Next, try to save the FP/BP in an unused SGPR. 85 TempSGPR = findScratchNonCalleeSaveRegister( 86 MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true); 87 88 if (!TempSGPR) { 89 int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, 90 TargetStackID::SGPRSpill); 91 92 if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { 93 // 3: There's no free lane to spill, and no free register to save FP/BP, 94 // so we're forced to spill another VGPR to use for the spill. 95 FrameIndex = NewFI; 96 97 LLVM_DEBUG( 98 auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); 99 dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to " 100 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); 101 } else { 102 // Remove dead <NewFI> index 103 MF.getFrameInfo().RemoveStackObject(NewFI); 104 // 4: If all else fails, spill the FP/BP to memory. 105 FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4)); 106 LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling " 107 << (IsFP ? "FP" : "BP") << '\n'); 108 } 109 } else { 110 LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to " 111 << printReg(TempSGPR, TRI) << '\n'); 112 } 113 } 114 115 // We need to specially emit stack operations here because a different frame 116 // register is used than in the rest of the function, as getFrameRegister would 117 // use. 118 static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, 119 const SIMachineFunctionInfo &FuncInfo, 120 LivePhysRegs &LiveRegs, MachineFunction &MF, 121 MachineBasicBlock &MBB, 122 MachineBasicBlock::iterator I, Register SpillReg, 123 int FI) { 124 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 125 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 126 127 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 128 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 129 MachineMemOperand *MMO = MF.getMachineMemOperand( 130 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI), 131 FrameInfo.getObjectAlign(FI)); 132 LiveRegs.addReg(SpillReg); 133 TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, true, 134 FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr, 135 &LiveRegs); 136 LiveRegs.removeReg(SpillReg); 137 } 138 139 static void buildEpilogRestore(const GCNSubtarget &ST, 140 const SIRegisterInfo &TRI, 141 const SIMachineFunctionInfo &FuncInfo, 142 LivePhysRegs &LiveRegs, MachineFunction &MF, 143 MachineBasicBlock &MBB, 144 MachineBasicBlock::iterator I, Register SpillReg, 145 int FI) { 146 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 147 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 148 149 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 150 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 151 MachineMemOperand *MMO = MF.getMachineMemOperand( 152 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI), 153 FrameInfo.getObjectAlign(FI)); 154 TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, false, 155 FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr, 156 &LiveRegs); 157 } 158 159 static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 160 const DebugLoc &DL, const SIInstrInfo *TII, 161 Register TargetReg) { 162 MachineFunction *MF = MBB.getParent(); 163 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 164 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 165 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 166 Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0); 167 Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1); 168 169 if (MFI->getGITPtrHigh() != 0xffffffff) { 170 BuildMI(MBB, I, DL, SMovB32, TargetHi) 171 .addImm(MFI->getGITPtrHigh()) 172 .addReg(TargetReg, RegState::ImplicitDefine); 173 } else { 174 const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); 175 BuildMI(MBB, I, DL, GetPC64, TargetReg); 176 } 177 Register GitPtrLo = MFI->getGITPtrLoReg(*MF); 178 MF->getRegInfo().addLiveIn(GitPtrLo); 179 MBB.addLiveIn(GitPtrLo); 180 BuildMI(MBB, I, DL, SMovB32, TargetLo) 181 .addReg(GitPtrLo); 182 } 183 184 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` 185 void SIFrameLowering::emitEntryFunctionFlatScratchInit( 186 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 187 const DebugLoc &DL, Register ScratchWaveOffsetReg) const { 188 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 189 const SIInstrInfo *TII = ST.getInstrInfo(); 190 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 191 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 192 193 // We don't need this if we only have spills since there is no user facing 194 // scratch. 195 196 // TODO: If we know we don't have flat instructions earlier, we can omit 197 // this from the input registers. 198 // 199 // TODO: We only need to know if we access scratch space through a flat 200 // pointer. Because we only detect if flat instructions are used at all, 201 // this will be used more often than necessary on VI. 202 203 Register FlatScrInitLo; 204 Register FlatScrInitHi; 205 206 if (ST.isAmdPalOS()) { 207 // Extract the scratch offset from the descriptor in the GIT 208 LivePhysRegs LiveRegs; 209 LiveRegs.init(*TRI); 210 LiveRegs.addLiveIns(MBB); 211 212 // Find unused reg to load flat scratch init into 213 MachineRegisterInfo &MRI = MF.getRegInfo(); 214 Register FlatScrInit = AMDGPU::NoRegister; 215 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); 216 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; 217 AllSGPR64s = AllSGPR64s.slice( 218 std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded)); 219 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 220 for (MCPhysReg Reg : AllSGPR64s) { 221 if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) && 222 !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { 223 FlatScrInit = Reg; 224 break; 225 } 226 } 227 assert(FlatScrInit && "Failed to find free register for scratch init"); 228 229 FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0); 230 FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1); 231 232 buildGitPtr(MBB, I, DL, TII, FlatScrInit); 233 234 // We now have the GIT ptr - now get the scratch descriptor from the entry 235 // at offset 0 (or offset 16 for a compute shader). 236 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 237 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); 238 auto *MMO = MF.getMachineMemOperand( 239 PtrInfo, 240 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | 241 MachineMemOperand::MODereferenceable, 242 8, Align(4)); 243 unsigned Offset = 244 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; 245 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); 246 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); 247 BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit) 248 .addReg(FlatScrInit) 249 .addImm(EncodedOffset) // offset 250 .addImm(0) // cpol 251 .addMemOperand(MMO); 252 253 // Mask the offset in [47:0] of the descriptor 254 const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32); 255 BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi) 256 .addReg(FlatScrInitHi) 257 .addImm(0xffff); 258 } else { 259 Register FlatScratchInitReg = 260 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); 261 assert(FlatScratchInitReg); 262 263 MachineRegisterInfo &MRI = MF.getRegInfo(); 264 MRI.addLiveIn(FlatScratchInitReg); 265 MBB.addLiveIn(FlatScratchInitReg); 266 267 FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); 268 FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); 269 } 270 271 // Do a 64-bit pointer add. 272 if (ST.flatScratchIsPointer()) { 273 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 274 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) 275 .addReg(FlatScrInitLo) 276 .addReg(ScratchWaveOffsetReg); 277 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi) 278 .addReg(FlatScrInitHi) 279 .addImm(0); 280 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 281 addReg(FlatScrInitLo). 282 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO | 283 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 284 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 285 addReg(FlatScrInitHi). 286 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | 287 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 288 return; 289 } 290 291 // For GFX9. 292 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) 293 .addReg(FlatScrInitLo) 294 .addReg(ScratchWaveOffsetReg); 295 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI) 296 .addReg(FlatScrInitHi) 297 .addImm(0); 298 299 return; 300 } 301 302 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); 303 304 // Copy the size in bytes. 305 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) 306 .addReg(FlatScrInitHi, RegState::Kill); 307 308 // Add wave offset in bytes to private base offset. 309 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. 310 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) 311 .addReg(FlatScrInitLo) 312 .addReg(ScratchWaveOffsetReg); 313 314 // Convert offset to 256-byte units. 315 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI) 316 .addReg(FlatScrInitLo, RegState::Kill) 317 .addImm(8); 318 } 319 320 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not 321 // memory. They should have been removed by now. 322 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { 323 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 324 I != E; ++I) { 325 if (!MFI.isDeadObjectIndex(I)) 326 return false; 327 } 328 329 return true; 330 } 331 332 // Shift down registers reserved for the scratch RSRC. 333 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( 334 MachineFunction &MF) const { 335 336 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 337 const SIInstrInfo *TII = ST.getInstrInfo(); 338 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 339 MachineRegisterInfo &MRI = MF.getRegInfo(); 340 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 341 342 assert(MFI->isEntryFunction()); 343 344 Register ScratchRsrcReg = MFI->getScratchRSrcReg(); 345 346 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) && 347 allStackObjectsAreDead(MF.getFrameInfo()))) 348 return Register(); 349 350 if (ST.hasSGPRInitBug() || 351 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF)) 352 return ScratchRsrcReg; 353 354 // We reserved the last registers for this. Shift it down to the end of those 355 // which were actually used. 356 // 357 // FIXME: It might be safer to use a pseudoregister before replacement. 358 359 // FIXME: We should be able to eliminate unused input registers. We only 360 // cannot do this for the resources required for scratch access. For now we 361 // skip over user SGPRs and may leave unused holes. 362 363 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; 364 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF); 365 AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); 366 367 // Skip the last N reserved elements because they should have already been 368 // reserved for VCC etc. 369 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 370 for (MCPhysReg Reg : AllSGPR128s) { 371 // Pick the first unallocated one. Make sure we don't clobber the other 372 // reserved input we needed. Also for PAL, make sure we don't clobber 373 // the GIT pointer passed in SGPR0 or SGPR8. 374 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 375 !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { 376 MRI.replaceRegWith(ScratchRsrcReg, Reg); 377 MFI->setScratchRSrcReg(Reg); 378 return Reg; 379 } 380 } 381 382 return ScratchRsrcReg; 383 } 384 385 static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { 386 return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize(); 387 } 388 389 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, 390 MachineBasicBlock &MBB) const { 391 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); 392 393 // FIXME: If we only have SGPR spills, we won't actually be using scratch 394 // memory since these spill to VGPRs. We should be cleaning up these unused 395 // SGPR spill frame indices somewhere. 396 397 // FIXME: We still have implicit uses on SGPR spill instructions in case they 398 // need to spill to vector memory. It's likely that will not happen, but at 399 // this point it appears we need the setup. This part of the prolog should be 400 // emitted after frame indices are eliminated. 401 402 // FIXME: Remove all of the isPhysRegUsed checks 403 404 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 405 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 406 const SIInstrInfo *TII = ST.getInstrInfo(); 407 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 408 MachineRegisterInfo &MRI = MF.getRegInfo(); 409 const Function &F = MF.getFunction(); 410 411 assert(MFI->isEntryFunction()); 412 413 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( 414 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); 415 // FIXME: Hack to not crash in situations which emitted an error. 416 if (!PreloadedScratchWaveOffsetReg) 417 return; 418 419 // We need to do the replacement of the private segment buffer register even 420 // if there are no stack objects. There could be stores to undef or a 421 // constant without an associated object. 422 // 423 // This will return `Register()` in cases where there are no actual 424 // uses of the SRSRC. 425 Register ScratchRsrcReg; 426 if (!ST.enableFlatScratch()) 427 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); 428 429 // Make the selected register live throughout the function. 430 if (ScratchRsrcReg) { 431 for (MachineBasicBlock &OtherBB : MF) { 432 if (&OtherBB != &MBB) { 433 OtherBB.addLiveIn(ScratchRsrcReg); 434 } 435 } 436 } 437 438 // Now that we have fixed the reserved SRSRC we need to locate the 439 // (potentially) preloaded SRSRC. 440 Register PreloadedScratchRsrcReg; 441 if (ST.isAmdHsaOrMesa(F)) { 442 PreloadedScratchRsrcReg = 443 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); 444 if (ScratchRsrcReg && PreloadedScratchRsrcReg) { 445 // We added live-ins during argument lowering, but since they were not 446 // used they were deleted. We're adding the uses now, so add them back. 447 MRI.addLiveIn(PreloadedScratchRsrcReg); 448 MBB.addLiveIn(PreloadedScratchRsrcReg); 449 } 450 } 451 452 // Debug location must be unknown since the first debug location is used to 453 // determine the end of the prologue. 454 DebugLoc DL; 455 MachineBasicBlock::iterator I = MBB.begin(); 456 457 // We found the SRSRC first because it needs four registers and has an 458 // alignment requirement. If the SRSRC that we found is clobbering with 459 // the scratch wave offset, which may be in a fixed SGPR or a free SGPR 460 // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch 461 // wave offset to a free SGPR. 462 Register ScratchWaveOffsetReg; 463 if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) { 464 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF); 465 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); 466 AllSGPRs = AllSGPRs.slice( 467 std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded)); 468 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 469 for (MCPhysReg Reg : AllSGPRs) { 470 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 471 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) { 472 ScratchWaveOffsetReg = Reg; 473 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) 474 .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); 475 break; 476 } 477 } 478 } else { 479 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; 480 } 481 assert(ScratchWaveOffsetReg); 482 483 if (requiresStackPointerReference(MF)) { 484 Register SPReg = MFI->getStackPtrOffsetReg(); 485 assert(SPReg != AMDGPU::SP_REG); 486 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) 487 .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST)); 488 } 489 490 if (hasFP(MF)) { 491 Register FPReg = MFI->getFrameOffsetReg(); 492 assert(FPReg != AMDGPU::FP_REG); 493 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); 494 } 495 496 if (MFI->hasFlatScratchInit() || ScratchRsrcReg) { 497 MRI.addLiveIn(PreloadedScratchWaveOffsetReg); 498 MBB.addLiveIn(PreloadedScratchWaveOffsetReg); 499 } 500 501 if (MFI->hasFlatScratchInit()) { 502 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); 503 } 504 505 if (ScratchRsrcReg) { 506 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, 507 PreloadedScratchRsrcReg, 508 ScratchRsrcReg, ScratchWaveOffsetReg); 509 } 510 } 511 512 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` 513 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( 514 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 515 const DebugLoc &DL, Register PreloadedScratchRsrcReg, 516 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { 517 518 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 519 const SIInstrInfo *TII = ST.getInstrInfo(); 520 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 521 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 522 const Function &Fn = MF.getFunction(); 523 524 if (ST.isAmdPalOS()) { 525 // The pointer to the GIT is formed from the offset passed in and either 526 // the amdgpu-git-ptr-high function attribute or the top part of the PC 527 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 528 Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); 529 530 buildGitPtr(MBB, I, DL, TII, Rsrc01); 531 532 // We now have the GIT ptr - now get the scratch descriptor from the entry 533 // at offset 0 (or offset 16 for a compute shader). 534 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 535 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM); 536 auto MMO = MF.getMachineMemOperand(PtrInfo, 537 MachineMemOperand::MOLoad | 538 MachineMemOperand::MOInvariant | 539 MachineMemOperand::MODereferenceable, 540 16, Align(4)); 541 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; 542 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); 543 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); 544 BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) 545 .addReg(Rsrc01) 546 .addImm(EncodedOffset) // offset 547 .addImm(0) // cpol 548 .addReg(ScratchRsrcReg, RegState::ImplicitDefine) 549 .addMemOperand(MMO); 550 551 // The driver will always set the SRD for wave 64 (bits 118:117 of 552 // descriptor / bits 22:21 of third sub-reg will be 0b11) 553 // If the shader is actually wave32 we have to modify the const_index_stride 554 // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The 555 // reason the driver does this is that there can be cases where it presents 556 // 2 shaders with different wave size (e.g. VsFs). 557 // TODO: convert to using SCRATCH instructions or multiple SRD buffers 558 if (ST.isWave32()) { 559 const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32); 560 BuildMI(MBB, I, DL, SBitsetB32, Rsrc03) 561 .addImm(21) 562 .addReg(Rsrc03); 563 } 564 } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { 565 assert(!ST.isAmdHsaOrMesa(Fn)); 566 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 567 568 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); 569 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); 570 571 // Use relocations to get the pointer, and setup the other bits manually. 572 uint64_t Rsrc23 = TII->getScratchRsrcWords23(); 573 574 if (MFI->hasImplicitBufferPtr()) { 575 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 576 577 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { 578 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); 579 580 BuildMI(MBB, I, DL, Mov64, Rsrc01) 581 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 582 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 583 } else { 584 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); 585 586 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 587 auto MMO = MF.getMachineMemOperand( 588 PtrInfo, 589 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | 590 MachineMemOperand::MODereferenceable, 591 8, Align(4)); 592 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) 593 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 594 .addImm(0) // offset 595 .addImm(0) // cpol 596 .addMemOperand(MMO) 597 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 598 599 MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 600 MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 601 } 602 } else { 603 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 604 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 605 606 BuildMI(MBB, I, DL, SMovB32, Rsrc0) 607 .addExternalSymbol("SCRATCH_RSRC_DWORD0") 608 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 609 610 BuildMI(MBB, I, DL, SMovB32, Rsrc1) 611 .addExternalSymbol("SCRATCH_RSRC_DWORD1") 612 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 613 614 } 615 616 BuildMI(MBB, I, DL, SMovB32, Rsrc2) 617 .addImm(Rsrc23 & 0xffffffff) 618 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 619 620 BuildMI(MBB, I, DL, SMovB32, Rsrc3) 621 .addImm(Rsrc23 >> 32) 622 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 623 } else if (ST.isAmdHsaOrMesa(Fn)) { 624 assert(PreloadedScratchRsrcReg); 625 626 if (ScratchRsrcReg != PreloadedScratchRsrcReg) { 627 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) 628 .addReg(PreloadedScratchRsrcReg, RegState::Kill); 629 } 630 } 631 632 // Add the scratch wave offset into the scratch RSRC. 633 // 634 // We only want to update the first 48 bits, which is the base address 635 // pointer, without touching the adjacent 16 bits of flags. We know this add 636 // cannot carry-out from bit 47, otherwise the scratch allocation would be 637 // impossible to fit in the 48-bit global address space. 638 // 639 // TODO: Evaluate if it is better to just construct an SRD using the flat 640 // scratch init and some constants rather than update the one we are passed. 641 Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 642 Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 643 644 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in 645 // the kernel body via inreg arguments. 646 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0) 647 .addReg(ScratchRsrcSub0) 648 .addReg(ScratchWaveOffsetReg) 649 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 650 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1) 651 .addReg(ScratchRsrcSub1) 652 .addImm(0) 653 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 654 } 655 656 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { 657 switch (ID) { 658 case TargetStackID::Default: 659 case TargetStackID::NoAlloc: 660 case TargetStackID::SGPRSpill: 661 return true; 662 case TargetStackID::ScalableVector: 663 return false; 664 } 665 llvm_unreachable("Invalid TargetStackID::Value"); 666 } 667 668 static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI, 669 const SIMachineFunctionInfo *FuncInfo, 670 MachineFunction &MF, MachineBasicBlock &MBB, 671 MachineBasicBlock::iterator MBBI, bool IsProlog) { 672 if (LiveRegs.empty()) { 673 LiveRegs.init(TRI); 674 if (IsProlog) { 675 LiveRegs.addLiveIns(MBB); 676 } else { 677 // In epilog. 678 LiveRegs.addLiveOuts(MBB); 679 LiveRegs.stepBackward(*MBBI); 680 } 681 } 682 } 683 684 // Activate all lanes, returns saved exec. 685 static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, 686 MachineFunction &MF, 687 MachineBasicBlock &MBB, 688 MachineBasicBlock::iterator MBBI, 689 bool IsProlog) { 690 Register ScratchExecCopy; 691 MachineRegisterInfo &MRI = MF.getRegInfo(); 692 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 693 const SIInstrInfo *TII = ST.getInstrInfo(); 694 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 695 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 696 DebugLoc DL; 697 698 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); 699 700 ScratchExecCopy = findScratchNonCalleeSaveRegister( 701 MRI, LiveRegs, *TRI.getWaveMaskRegClass()); 702 if (!ScratchExecCopy) 703 report_fatal_error("failed to find free scratch register"); 704 705 LiveRegs.addReg(ScratchExecCopy); 706 707 const unsigned OrSaveExec = 708 ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; 709 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1); 710 711 return ScratchExecCopy; 712 } 713 714 // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. 715 // Otherwise we are spilling to memory. 716 static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) { 717 const MachineFrameInfo &MFI = MF.getFrameInfo(); 718 return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill; 719 } 720 721 void SIFrameLowering::emitPrologue(MachineFunction &MF, 722 MachineBasicBlock &MBB) const { 723 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 724 if (FuncInfo->isEntryFunction()) { 725 emitEntryFunctionPrologue(MF, MBB); 726 return; 727 } 728 729 const MachineFrameInfo &MFI = MF.getFrameInfo(); 730 MachineRegisterInfo &MRI = MF.getRegInfo(); 731 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 732 const SIInstrInfo *TII = ST.getInstrInfo(); 733 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 734 735 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 736 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 737 Register BasePtrReg = 738 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); 739 LivePhysRegs LiveRegs; 740 741 MachineBasicBlock::iterator MBBI = MBB.begin(); 742 DebugLoc DL; 743 744 bool HasFP = false; 745 bool HasBP = false; 746 uint32_t NumBytes = MFI.getStackSize(); 747 uint32_t RoundedSize = NumBytes; 748 // To avoid clobbering VGPRs in lanes that weren't active on function entry, 749 // turn on all lanes before doing the spill to memory. 750 Register ScratchExecCopy; 751 752 Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; 753 Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; 754 755 // VGPRs used for SGPR->VGPR spills 756 for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg : 757 FuncInfo->getSGPRSpillVGPRs()) { 758 if (!Reg.FI) 759 continue; 760 761 if (!ScratchExecCopy) 762 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, 763 /*IsProlog*/ true); 764 765 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR, 766 *Reg.FI); 767 } 768 769 // VGPRs used for Whole Wave Mode 770 for (const auto &Reg : FuncInfo->WWMReservedRegs) { 771 auto VGPR = Reg.first; 772 auto FI = Reg.second; 773 if (!FI) 774 continue; 775 776 if (!ScratchExecCopy) 777 ScratchExecCopy = 778 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true); 779 780 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI); 781 } 782 783 if (ScratchExecCopy) { 784 // FIXME: Split block and make terminator. 785 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 786 MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 787 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 788 .addReg(ScratchExecCopy, RegState::Kill); 789 LiveRegs.addReg(ScratchExecCopy); 790 } 791 792 if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) { 793 const int FramePtrFI = *FPSaveIndex; 794 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 795 796 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); 797 798 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 799 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 800 if (!TmpVGPR) 801 report_fatal_error("failed to find free scratch register"); 802 803 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 804 .addReg(FramePtrReg); 805 806 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 807 FramePtrFI); 808 } 809 810 if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) { 811 const int BasePtrFI = *BPSaveIndex; 812 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 813 814 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); 815 816 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 817 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 818 if (!TmpVGPR) 819 report_fatal_error("failed to find free scratch register"); 820 821 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 822 .addReg(BasePtrReg); 823 824 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 825 BasePtrFI); 826 } 827 828 // In this case, spill the FP to a reserved VGPR. 829 if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) { 830 const int FramePtrFI = *FPSaveIndex; 831 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 832 833 assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); 834 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 835 FuncInfo->getSGPRToVGPRSpills(FramePtrFI); 836 assert(Spill.size() == 1); 837 838 // Save FP before setting it up. 839 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) 840 .addReg(FramePtrReg) 841 .addImm(Spill[0].Lane) 842 .addReg(Spill[0].VGPR, RegState::Undef); 843 } 844 845 // In this case, spill the BP to a reserved VGPR. 846 if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) { 847 const int BasePtrFI = *BPSaveIndex; 848 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 849 850 assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); 851 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 852 FuncInfo->getSGPRToVGPRSpills(BasePtrFI); 853 assert(Spill.size() == 1); 854 855 // Save BP before setting it up. 856 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) 857 .addReg(BasePtrReg) 858 .addImm(Spill[0].Lane) 859 .addReg(Spill[0].VGPR, RegState::Undef); 860 } 861 862 // Emit the copy if we need an FP, and are using a free SGPR to save it. 863 if (FuncInfo->SGPRForFPSaveRestoreCopy) { 864 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), 865 FuncInfo->SGPRForFPSaveRestoreCopy) 866 .addReg(FramePtrReg) 867 .setMIFlag(MachineInstr::FrameSetup); 868 } 869 870 // Emit the copy if we need a BP, and are using a free SGPR to save it. 871 if (FuncInfo->SGPRForBPSaveRestoreCopy) { 872 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), 873 FuncInfo->SGPRForBPSaveRestoreCopy) 874 .addReg(BasePtrReg) 875 .setMIFlag(MachineInstr::FrameSetup); 876 } 877 878 // If a copy has been emitted for FP and/or BP, Make the SGPRs 879 // used in the copy instructions live throughout the function. 880 SmallVector<MCPhysReg, 2> TempSGPRs; 881 if (FuncInfo->SGPRForFPSaveRestoreCopy) 882 TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy); 883 884 if (FuncInfo->SGPRForBPSaveRestoreCopy) 885 TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy); 886 887 if (!TempSGPRs.empty()) { 888 for (MachineBasicBlock &MBB : MF) { 889 for (MCPhysReg Reg : TempSGPRs) 890 MBB.addLiveIn(Reg); 891 892 MBB.sortUniqueLiveIns(); 893 } 894 if (!LiveRegs.empty()) { 895 LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); 896 LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy); 897 } 898 } 899 900 if (TRI.hasStackRealignment(MF)) { 901 HasFP = true; 902 const unsigned Alignment = MFI.getMaxAlign().value(); 903 904 RoundedSize += Alignment; 905 if (LiveRegs.empty()) { 906 LiveRegs.init(TRI); 907 LiveRegs.addLiveIns(MBB); 908 } 909 910 // s_add_u32 s33, s32, NumBytes 911 // s_and_b32 s33, s33, 0b111...0000 912 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), FramePtrReg) 913 .addReg(StackPtrReg) 914 .addImm((Alignment - 1) * getScratchScaleFactor(ST)) 915 .setMIFlag(MachineInstr::FrameSetup); 916 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) 917 .addReg(FramePtrReg, RegState::Kill) 918 .addImm(-Alignment * getScratchScaleFactor(ST)) 919 .setMIFlag(MachineInstr::FrameSetup); 920 FuncInfo->setIsStackRealigned(true); 921 } else if ((HasFP = hasFP(MF))) { 922 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 923 .addReg(StackPtrReg) 924 .setMIFlag(MachineInstr::FrameSetup); 925 } 926 927 // If we need a base pointer, set it up here. It's whatever the value of 928 // the stack pointer is at this point. Any variable size objects will be 929 // allocated after this, so we can still use the base pointer to reference 930 // the incoming arguments. 931 if ((HasBP = TRI.hasBasePointer(MF))) { 932 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 933 .addReg(StackPtrReg) 934 .setMIFlag(MachineInstr::FrameSetup); 935 } 936 937 if (HasFP && RoundedSize != 0) { 938 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) 939 .addReg(StackPtrReg) 940 .addImm(RoundedSize * getScratchScaleFactor(ST)) 941 .setMIFlag(MachineInstr::FrameSetup); 942 } 943 944 assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy || 945 FuncInfo->FramePointerSaveIndex)) && 946 "Needed to save FP but didn't save it anywhere"); 947 948 assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy && 949 !FuncInfo->FramePointerSaveIndex)) && 950 "Saved FP but didn't need it"); 951 952 assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy || 953 FuncInfo->BasePointerSaveIndex)) && 954 "Needed to save BP but didn't save it anywhere"); 955 956 assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy && 957 !FuncInfo->BasePointerSaveIndex)) && 958 "Saved BP but didn't need it"); 959 } 960 961 void SIFrameLowering::emitEpilogue(MachineFunction &MF, 962 MachineBasicBlock &MBB) const { 963 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 964 if (FuncInfo->isEntryFunction()) 965 return; 966 967 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 968 const SIInstrInfo *TII = ST.getInstrInfo(); 969 MachineRegisterInfo &MRI = MF.getRegInfo(); 970 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 971 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); 972 LivePhysRegs LiveRegs; 973 DebugLoc DL; 974 975 const MachineFrameInfo &MFI = MF.getFrameInfo(); 976 uint32_t NumBytes = MFI.getStackSize(); 977 uint32_t RoundedSize = FuncInfo->isStackRealigned() 978 ? NumBytes + MFI.getMaxAlign().value() 979 : NumBytes; 980 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 981 const Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 982 const Register BasePtrReg = 983 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); 984 985 Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; 986 Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; 987 988 if (RoundedSize != 0 && hasFP(MF)) { 989 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) 990 .addReg(StackPtrReg) 991 .addImm(RoundedSize * getScratchScaleFactor(ST)) 992 .setMIFlag(MachineInstr::FrameDestroy); 993 } 994 995 if (FuncInfo->SGPRForFPSaveRestoreCopy) { 996 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 997 .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) 998 .setMIFlag(MachineInstr::FrameDestroy); 999 } 1000 1001 if (FuncInfo->SGPRForBPSaveRestoreCopy) { 1002 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 1003 .addReg(FuncInfo->SGPRForBPSaveRestoreCopy) 1004 .setMIFlag(MachineInstr::FrameDestroy); 1005 } 1006 1007 if (FPSaveIndex) { 1008 const int FramePtrFI = *FPSaveIndex; 1009 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 1010 if (spilledToMemory(MF, FramePtrFI)) { 1011 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); 1012 1013 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 1014 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 1015 if (!TmpVGPR) 1016 report_fatal_error("failed to find free scratch register"); 1017 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 1018 FramePtrFI); 1019 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) 1020 .addReg(TmpVGPR, RegState::Kill); 1021 } else { 1022 // Reload from VGPR spill. 1023 assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); 1024 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 1025 FuncInfo->getSGPRToVGPRSpills(FramePtrFI); 1026 assert(Spill.size() == 1); 1027 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg) 1028 .addReg(Spill[0].VGPR) 1029 .addImm(Spill[0].Lane); 1030 } 1031 } 1032 1033 if (BPSaveIndex) { 1034 const int BasePtrFI = *BPSaveIndex; 1035 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 1036 if (spilledToMemory(MF, BasePtrFI)) { 1037 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); 1038 1039 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 1040 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 1041 if (!TmpVGPR) 1042 report_fatal_error("failed to find free scratch register"); 1043 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 1044 BasePtrFI); 1045 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) 1046 .addReg(TmpVGPR, RegState::Kill); 1047 } else { 1048 // Reload from VGPR spill. 1049 assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); 1050 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 1051 FuncInfo->getSGPRToVGPRSpills(BasePtrFI); 1052 assert(Spill.size() == 1); 1053 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), BasePtrReg) 1054 .addReg(Spill[0].VGPR) 1055 .addImm(Spill[0].Lane); 1056 } 1057 } 1058 1059 Register ScratchExecCopy; 1060 for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg : 1061 FuncInfo->getSGPRSpillVGPRs()) { 1062 if (!Reg.FI) 1063 continue; 1064 1065 if (!ScratchExecCopy) 1066 ScratchExecCopy = 1067 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); 1068 1069 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR, 1070 *Reg.FI); 1071 } 1072 1073 for (const auto &Reg : FuncInfo->WWMReservedRegs) { 1074 auto VGPR = Reg.first; 1075 auto FI = Reg.second; 1076 if (!FI) 1077 continue; 1078 1079 if (!ScratchExecCopy) 1080 ScratchExecCopy = 1081 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); 1082 1083 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI); 1084 } 1085 1086 if (ScratchExecCopy) { 1087 // FIXME: Split block and make terminator. 1088 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1089 MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1090 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 1091 .addReg(ScratchExecCopy, RegState::Kill); 1092 } 1093 } 1094 1095 #ifndef NDEBUG 1096 static bool allSGPRSpillsAreDead(const MachineFunction &MF) { 1097 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1098 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1099 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 1100 I != E; ++I) { 1101 if (!MFI.isDeadObjectIndex(I) && 1102 MFI.getStackID(I) == TargetStackID::SGPRSpill && 1103 (I != FuncInfo->FramePointerSaveIndex && 1104 I != FuncInfo->BasePointerSaveIndex)) { 1105 return false; 1106 } 1107 } 1108 1109 return true; 1110 } 1111 #endif 1112 1113 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, 1114 int FI, 1115 Register &FrameReg) const { 1116 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); 1117 1118 FrameReg = RI->getFrameRegister(MF); 1119 return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI)); 1120 } 1121 1122 void SIFrameLowering::processFunctionBeforeFrameFinalized( 1123 MachineFunction &MF, 1124 RegScavenger *RS) const { 1125 MachineFrameInfo &MFI = MF.getFrameInfo(); 1126 1127 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1128 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1129 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1130 1131 FuncInfo->removeDeadFrameIndices(MFI); 1132 assert(allSGPRSpillsAreDead(MF) && 1133 "SGPR spill should have been removed in SILowerSGPRSpills"); 1134 1135 // FIXME: The other checks should be redundant with allStackObjectsAreDead, 1136 // but currently hasNonSpillStackObjects is set only from source 1137 // allocas. Stack temps produced from legalization are not counted currently. 1138 if (!allStackObjectsAreDead(MFI)) { 1139 assert(RS && "RegScavenger required if spilling"); 1140 1141 // Add an emergency spill slot 1142 RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI)); 1143 } 1144 } 1145 1146 // Only report VGPRs to generic code. 1147 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, 1148 BitVector &SavedVGPRs, 1149 RegScavenger *RS) const { 1150 TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); 1151 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1152 if (MFI->isEntryFunction()) 1153 return; 1154 1155 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 1156 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1157 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1158 1159 // Ignore the SGPRs the default implementation found. 1160 SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask()); 1161 1162 // Do not save AGPRs prior to GFX90A because there was no easy way to do so. 1163 // In gfx908 there was do AGPR loads and stores and thus spilling also 1164 // require a temporary VGPR. 1165 if (!ST.hasGFX90AInsts()) 1166 SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); 1167 1168 // hasFP only knows about stack objects that already exist. We're now 1169 // determining the stack slots that will be created, so we have to predict 1170 // them. Stack objects force FP usage with calls. 1171 // 1172 // Note a new VGPR CSR may be introduced if one is used for the spill, but we 1173 // don't want to report it here. 1174 // 1175 // FIXME: Is this really hasReservedCallFrame? 1176 const bool WillHaveFP = 1177 FrameInfo.hasCalls() && 1178 (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); 1179 1180 // VGPRs used for SGPR spilling need to be specially inserted in the prolog, 1181 // so don't allow the default insertion to handle them. 1182 for (auto SSpill : MFI->getSGPRSpillVGPRs()) 1183 SavedVGPRs.reset(SSpill.VGPR); 1184 1185 LivePhysRegs LiveRegs; 1186 LiveRegs.init(*TRI); 1187 1188 if (WillHaveFP || hasFP(MF)) { 1189 assert(!MFI->SGPRForFPSaveRestoreCopy && !MFI->FramePointerSaveIndex && 1190 "Re-reserving spill slot for FP"); 1191 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy, 1192 MFI->FramePointerSaveIndex, true); 1193 } 1194 1195 if (TRI->hasBasePointer(MF)) { 1196 if (MFI->SGPRForFPSaveRestoreCopy) 1197 LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy); 1198 1199 assert(!MFI->SGPRForBPSaveRestoreCopy && 1200 !MFI->BasePointerSaveIndex && "Re-reserving spill slot for BP"); 1201 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy, 1202 MFI->BasePointerSaveIndex, false); 1203 } 1204 } 1205 1206 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, 1207 BitVector &SavedRegs, 1208 RegScavenger *RS) const { 1209 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); 1210 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1211 if (MFI->isEntryFunction()) 1212 return; 1213 1214 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1215 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1216 1217 // The SP is specifically managed and we don't want extra spills of it. 1218 SavedRegs.reset(MFI->getStackPtrOffsetReg()); 1219 1220 const BitVector AllSavedRegs = SavedRegs; 1221 SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask()); 1222 1223 // If clearing VGPRs changed the mask, we will have some CSR VGPR spills. 1224 const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs; 1225 1226 // We have to anticipate introducing CSR VGPR spills if we don't have any 1227 // stack objects already, since we require an FP if there is a call and stack. 1228 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 1229 const bool WillHaveFP = FrameInfo.hasCalls() && HaveAnyCSRVGPR; 1230 1231 // FP will be specially managed like SP. 1232 if (WillHaveFP || hasFP(MF)) 1233 SavedRegs.reset(MFI->getFrameOffsetReg()); 1234 } 1235 1236 bool SIFrameLowering::assignCalleeSavedSpillSlots( 1237 MachineFunction &MF, const TargetRegisterInfo *TRI, 1238 std::vector<CalleeSavedInfo> &CSI) const { 1239 if (CSI.empty()) 1240 return true; // Early exit if no callee saved registers are modified! 1241 1242 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1243 if (!FuncInfo->SGPRForFPSaveRestoreCopy && 1244 !FuncInfo->SGPRForBPSaveRestoreCopy) 1245 return false; 1246 1247 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1248 const SIRegisterInfo *RI = ST.getRegisterInfo(); 1249 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1250 Register BasePtrReg = RI->getBaseRegister(); 1251 unsigned NumModifiedRegs = 0; 1252 1253 if (FuncInfo->SGPRForFPSaveRestoreCopy) 1254 NumModifiedRegs++; 1255 if (FuncInfo->SGPRForBPSaveRestoreCopy) 1256 NumModifiedRegs++; 1257 1258 for (auto &CS : CSI) { 1259 if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) { 1260 CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); 1261 if (--NumModifiedRegs) 1262 break; 1263 } else if (CS.getReg() == BasePtrReg && 1264 FuncInfo->SGPRForBPSaveRestoreCopy) { 1265 CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy); 1266 if (--NumModifiedRegs) 1267 break; 1268 } 1269 } 1270 1271 return false; 1272 } 1273 1274 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( 1275 MachineFunction &MF, 1276 MachineBasicBlock &MBB, 1277 MachineBasicBlock::iterator I) const { 1278 int64_t Amount = I->getOperand(0).getImm(); 1279 if (Amount == 0) 1280 return MBB.erase(I); 1281 1282 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1283 const SIInstrInfo *TII = ST.getInstrInfo(); 1284 const DebugLoc &DL = I->getDebugLoc(); 1285 unsigned Opc = I->getOpcode(); 1286 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); 1287 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; 1288 1289 if (!hasReservedCallFrame(MF)) { 1290 Amount = alignTo(Amount, getStackAlign()); 1291 assert(isUInt<32>(Amount) && "exceeded stack address space size"); 1292 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1293 Register SPReg = MFI->getStackPtrOffsetReg(); 1294 1295 unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 1296 BuildMI(MBB, I, DL, TII->get(Op), SPReg) 1297 .addReg(SPReg) 1298 .addImm(Amount * getScratchScaleFactor(ST)); 1299 } else if (CalleePopAmount != 0) { 1300 llvm_unreachable("is this used?"); 1301 } 1302 1303 return MBB.erase(I); 1304 } 1305 1306 /// Returns true if the frame will require a reference to the stack pointer. 1307 /// 1308 /// This is the set of conditions common to setting up the stack pointer in a 1309 /// kernel, and for using a frame pointer in a callable function. 1310 /// 1311 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm 1312 /// references SP. 1313 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) { 1314 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint(); 1315 } 1316 1317 // The FP for kernels is always known 0, so we never really need to setup an 1318 // explicit register for it. However, DisableFramePointerElim will force us to 1319 // use a register for it. 1320 bool SIFrameLowering::hasFP(const MachineFunction &MF) const { 1321 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1322 1323 // For entry functions we can use an immediate offset in most cases, so the 1324 // presence of calls doesn't imply we need a distinct frame pointer. 1325 if (MFI.hasCalls() && 1326 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { 1327 // All offsets are unsigned, so need to be addressed in the same direction 1328 // as stack growth. 1329 1330 // FIXME: This function is pretty broken, since it can be called before the 1331 // frame layout is determined or CSR spills are inserted. 1332 return MFI.getStackSize() != 0; 1333 } 1334 1335 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || 1336 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( 1337 MF) || 1338 MF.getTarget().Options.DisableFramePointerElim(MF); 1339 } 1340 1341 // This is essentially a reduced version of hasFP for entry functions. Since the 1342 // stack pointer is known 0 on entry to kernels, we never really need an FP 1343 // register. We may need to initialize the stack pointer depending on the frame 1344 // properties, which logically overlaps many of the cases where an ordinary 1345 // function would require an FP. 1346 bool SIFrameLowering::requiresStackPointerReference( 1347 const MachineFunction &MF) const { 1348 // Callable functions always require a stack pointer reference. 1349 assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && 1350 "only expected to call this for entry points"); 1351 1352 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1353 1354 // Entry points ordinarily don't need to initialize SP. We have to set it up 1355 // for callees if there are any. Also note tail calls are impossible/don't 1356 // make any sense for kernels. 1357 if (MFI.hasCalls()) 1358 return true; 1359 1360 // We still need to initialize the SP if we're doing anything weird that 1361 // references the SP, like variable sized stack objects. 1362 return frameTriviallyRequiresSP(MFI); 1363 } 1364