1 //===----------------------- SIFrameLowering.cpp --------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 9 #include "SIFrameLowering.h" 10 #include "AMDGPU.h" 11 #include "GCNSubtarget.h" 12 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 13 #include "SIMachineFunctionInfo.h" 14 #include "llvm/CodeGen/LivePhysRegs.h" 15 #include "llvm/CodeGen/MachineFrameInfo.h" 16 #include "llvm/CodeGen/RegisterScavenging.h" 17 #include "llvm/Target/TargetMachine.h" 18 19 using namespace llvm; 20 21 #define DEBUG_TYPE "frame-info" 22 23 // Find a scratch register that we can use in the prologue. We avoid using 24 // callee-save registers since they may appear to be free when this is called 25 // from canUseAsPrologue (during shrink wrapping), but then no longer be free 26 // when this is called from emitPrologue. 27 static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, 28 LivePhysRegs &LiveRegs, 29 const TargetRegisterClass &RC, 30 bool Unused = false) { 31 // Mark callee saved registers as used so we will not choose them. 32 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); 33 for (unsigned i = 0; CSRegs[i]; ++i) 34 LiveRegs.addReg(CSRegs[i]); 35 36 if (Unused) { 37 // We are looking for a register that can be used throughout the entire 38 // function, so any use is unacceptable. 39 for (MCRegister Reg : RC) { 40 if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg)) 41 return Reg; 42 } 43 } else { 44 for (MCRegister Reg : RC) { 45 if (LiveRegs.available(MRI, Reg)) 46 return Reg; 47 } 48 } 49 50 return MCRegister(); 51 } 52 53 static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF, 54 LivePhysRegs &LiveRegs, 55 Register &TempSGPR, 56 Optional<int> &FrameIndex, 57 bool IsFP) { 58 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 59 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 60 61 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 62 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 63 64 // We need to save and restore the current FP/BP. 65 66 // 1: If there is already a VGPR with free lanes, use it. We 67 // may already have to pay the penalty for spilling a CSR VGPR. 68 if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) { 69 int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, 70 TargetStackID::SGPRSpill); 71 72 if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) 73 llvm_unreachable("allocate SGPR spill should have worked"); 74 75 FrameIndex = NewFI; 76 77 LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); 78 dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to " 79 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane 80 << '\n'); 81 return; 82 } 83 84 // 2: Next, try to save the FP/BP in an unused SGPR. 85 TempSGPR = findScratchNonCalleeSaveRegister( 86 MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true); 87 88 if (!TempSGPR) { 89 int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, 90 TargetStackID::SGPRSpill); 91 92 if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { 93 // 3: There's no free lane to spill, and no free register to save FP/BP, 94 // so we're forced to spill another VGPR to use for the spill. 95 FrameIndex = NewFI; 96 97 LLVM_DEBUG( 98 auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); 99 dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to " 100 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); 101 } else { 102 // Remove dead <NewFI> index 103 MF.getFrameInfo().RemoveStackObject(NewFI); 104 // 4: If all else fails, spill the FP/BP to memory. 105 FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4)); 106 LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling " 107 << (IsFP ? "FP" : "BP") << '\n'); 108 } 109 } else { 110 LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to " 111 << printReg(TempSGPR, TRI) << '\n'); 112 } 113 } 114 115 // We need to specially emit stack operations here because a different frame 116 // register is used than in the rest of the function, as getFrameRegister would 117 // use. 118 static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, 119 const SIMachineFunctionInfo &FuncInfo, 120 LivePhysRegs &LiveRegs, MachineFunction &MF, 121 MachineBasicBlock &MBB, 122 MachineBasicBlock::iterator I, Register SpillReg, 123 int FI) { 124 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 125 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 126 127 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 128 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 129 MachineMemOperand *MMO = MF.getMachineMemOperand( 130 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI), 131 FrameInfo.getObjectAlign(FI)); 132 LiveRegs.addReg(SpillReg); 133 TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, true, 134 FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr, 135 &LiveRegs); 136 LiveRegs.removeReg(SpillReg); 137 } 138 139 static void buildEpilogRestore(const GCNSubtarget &ST, 140 const SIRegisterInfo &TRI, 141 const SIMachineFunctionInfo &FuncInfo, 142 LivePhysRegs &LiveRegs, MachineFunction &MF, 143 MachineBasicBlock &MBB, 144 MachineBasicBlock::iterator I, Register SpillReg, 145 int FI) { 146 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 147 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 148 149 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 150 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 151 MachineMemOperand *MMO = MF.getMachineMemOperand( 152 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI), 153 FrameInfo.getObjectAlign(FI)); 154 TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, false, 155 FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr, 156 &LiveRegs); 157 } 158 159 static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 160 const DebugLoc &DL, const SIInstrInfo *TII, 161 Register TargetReg) { 162 MachineFunction *MF = MBB.getParent(); 163 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 164 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 165 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 166 Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0); 167 Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1); 168 169 if (MFI->getGITPtrHigh() != 0xffffffff) { 170 BuildMI(MBB, I, DL, SMovB32, TargetHi) 171 .addImm(MFI->getGITPtrHigh()) 172 .addReg(TargetReg, RegState::ImplicitDefine); 173 } else { 174 const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); 175 BuildMI(MBB, I, DL, GetPC64, TargetReg); 176 } 177 Register GitPtrLo = MFI->getGITPtrLoReg(*MF); 178 MF->getRegInfo().addLiveIn(GitPtrLo); 179 MBB.addLiveIn(GitPtrLo); 180 BuildMI(MBB, I, DL, SMovB32, TargetLo) 181 .addReg(GitPtrLo); 182 } 183 184 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` 185 void SIFrameLowering::emitEntryFunctionFlatScratchInit( 186 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 187 const DebugLoc &DL, Register ScratchWaveOffsetReg) const { 188 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 189 const SIInstrInfo *TII = ST.getInstrInfo(); 190 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 191 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 192 193 // We don't need this if we only have spills since there is no user facing 194 // scratch. 195 196 // TODO: If we know we don't have flat instructions earlier, we can omit 197 // this from the input registers. 198 // 199 // TODO: We only need to know if we access scratch space through a flat 200 // pointer. Because we only detect if flat instructions are used at all, 201 // this will be used more often than necessary on VI. 202 203 Register FlatScrInitLo; 204 Register FlatScrInitHi; 205 206 if (ST.isAmdPalOS()) { 207 // Extract the scratch offset from the descriptor in the GIT 208 LivePhysRegs LiveRegs; 209 LiveRegs.init(*TRI); 210 LiveRegs.addLiveIns(MBB); 211 212 // Find unused reg to load flat scratch init into 213 MachineRegisterInfo &MRI = MF.getRegInfo(); 214 Register FlatScrInit = AMDGPU::NoRegister; 215 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); 216 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; 217 AllSGPR64s = AllSGPR64s.slice( 218 std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded)); 219 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 220 for (MCPhysReg Reg : AllSGPR64s) { 221 if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) && 222 !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { 223 FlatScrInit = Reg; 224 break; 225 } 226 } 227 assert(FlatScrInit && "Failed to find free register for scratch init"); 228 229 FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0); 230 FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1); 231 232 buildGitPtr(MBB, I, DL, TII, FlatScrInit); 233 234 // We now have the GIT ptr - now get the scratch descriptor from the entry 235 // at offset 0 (or offset 16 for a compute shader). 236 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 237 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); 238 auto *MMO = MF.getMachineMemOperand( 239 PtrInfo, 240 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | 241 MachineMemOperand::MODereferenceable, 242 8, Align(4)); 243 unsigned Offset = 244 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; 245 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); 246 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); 247 BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit) 248 .addReg(FlatScrInit) 249 .addImm(EncodedOffset) // offset 250 .addImm(0) // cpol 251 .addMemOperand(MMO); 252 253 // Mask the offset in [47:0] of the descriptor 254 const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32); 255 BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi) 256 .addReg(FlatScrInitHi) 257 .addImm(0xffff); 258 } else { 259 Register FlatScratchInitReg = 260 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); 261 assert(FlatScratchInitReg); 262 263 MachineRegisterInfo &MRI = MF.getRegInfo(); 264 MRI.addLiveIn(FlatScratchInitReg); 265 MBB.addLiveIn(FlatScratchInitReg); 266 267 FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); 268 FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); 269 } 270 271 // Do a 64-bit pointer add. 272 if (ST.flatScratchIsPointer()) { 273 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 274 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) 275 .addReg(FlatScrInitLo) 276 .addReg(ScratchWaveOffsetReg); 277 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi) 278 .addReg(FlatScrInitHi) 279 .addImm(0); 280 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 281 addReg(FlatScrInitLo). 282 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO | 283 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 284 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 285 addReg(FlatScrInitHi). 286 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | 287 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 288 return; 289 } 290 291 // For GFX9. 292 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) 293 .addReg(FlatScrInitLo) 294 .addReg(ScratchWaveOffsetReg); 295 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI) 296 .addReg(FlatScrInitHi) 297 .addImm(0); 298 299 return; 300 } 301 302 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); 303 304 // Copy the size in bytes. 305 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) 306 .addReg(FlatScrInitHi, RegState::Kill); 307 308 // Add wave offset in bytes to private base offset. 309 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. 310 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) 311 .addReg(FlatScrInitLo) 312 .addReg(ScratchWaveOffsetReg); 313 314 // Convert offset to 256-byte units. 315 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI) 316 .addReg(FlatScrInitLo, RegState::Kill) 317 .addImm(8); 318 } 319 320 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not 321 // memory. They should have been removed by now. 322 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { 323 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 324 I != E; ++I) { 325 if (!MFI.isDeadObjectIndex(I)) 326 return false; 327 } 328 329 return true; 330 } 331 332 // Shift down registers reserved for the scratch RSRC. 333 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( 334 MachineFunction &MF) const { 335 336 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 337 const SIInstrInfo *TII = ST.getInstrInfo(); 338 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 339 MachineRegisterInfo &MRI = MF.getRegInfo(); 340 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 341 342 assert(MFI->isEntryFunction()); 343 344 Register ScratchRsrcReg = MFI->getScratchRSrcReg(); 345 346 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) && 347 allStackObjectsAreDead(MF.getFrameInfo()))) 348 return Register(); 349 350 if (ST.hasSGPRInitBug() || 351 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF)) 352 return ScratchRsrcReg; 353 354 // We reserved the last registers for this. Shift it down to the end of those 355 // which were actually used. 356 // 357 // FIXME: It might be safer to use a pseudoregister before replacement. 358 359 // FIXME: We should be able to eliminate unused input registers. We only 360 // cannot do this for the resources required for scratch access. For now we 361 // skip over user SGPRs and may leave unused holes. 362 363 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; 364 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF); 365 AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); 366 367 // Skip the last N reserved elements because they should have already been 368 // reserved for VCC etc. 369 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 370 for (MCPhysReg Reg : AllSGPR128s) { 371 // Pick the first unallocated one. Make sure we don't clobber the other 372 // reserved input we needed. Also for PAL, make sure we don't clobber 373 // the GIT pointer passed in SGPR0 or SGPR8. 374 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 375 !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { 376 MRI.replaceRegWith(ScratchRsrcReg, Reg); 377 MFI->setScratchRSrcReg(Reg); 378 return Reg; 379 } 380 } 381 382 return ScratchRsrcReg; 383 } 384 385 static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { 386 return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize(); 387 } 388 389 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, 390 MachineBasicBlock &MBB) const { 391 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); 392 393 // FIXME: If we only have SGPR spills, we won't actually be using scratch 394 // memory since these spill to VGPRs. We should be cleaning up these unused 395 // SGPR spill frame indices somewhere. 396 397 // FIXME: We still have implicit uses on SGPR spill instructions in case they 398 // need to spill to vector memory. It's likely that will not happen, but at 399 // this point it appears we need the setup. This part of the prolog should be 400 // emitted after frame indices are eliminated. 401 402 // FIXME: Remove all of the isPhysRegUsed checks 403 404 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 405 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 406 const SIInstrInfo *TII = ST.getInstrInfo(); 407 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 408 MachineRegisterInfo &MRI = MF.getRegInfo(); 409 const Function &F = MF.getFunction(); 410 411 assert(MFI->isEntryFunction()); 412 413 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( 414 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); 415 // FIXME: Hack to not crash in situations which emitted an error. 416 if (!PreloadedScratchWaveOffsetReg) 417 return; 418 419 // We need to do the replacement of the private segment buffer register even 420 // if there are no stack objects. There could be stores to undef or a 421 // constant without an associated object. 422 // 423 // This will return `Register()` in cases where there are no actual 424 // uses of the SRSRC. 425 Register ScratchRsrcReg; 426 if (!ST.enableFlatScratch()) 427 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); 428 429 // Make the selected register live throughout the function. 430 if (ScratchRsrcReg) { 431 for (MachineBasicBlock &OtherBB : MF) { 432 if (&OtherBB != &MBB) { 433 OtherBB.addLiveIn(ScratchRsrcReg); 434 } 435 } 436 } 437 438 // Now that we have fixed the reserved SRSRC we need to locate the 439 // (potentially) preloaded SRSRC. 440 Register PreloadedScratchRsrcReg; 441 if (ST.isAmdHsaOrMesa(F)) { 442 PreloadedScratchRsrcReg = 443 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); 444 if (ScratchRsrcReg && PreloadedScratchRsrcReg) { 445 // We added live-ins during argument lowering, but since they were not 446 // used they were deleted. We're adding the uses now, so add them back. 447 MRI.addLiveIn(PreloadedScratchRsrcReg); 448 MBB.addLiveIn(PreloadedScratchRsrcReg); 449 } 450 } 451 452 // Debug location must be unknown since the first debug location is used to 453 // determine the end of the prologue. 454 DebugLoc DL; 455 MachineBasicBlock::iterator I = MBB.begin(); 456 457 // We found the SRSRC first because it needs four registers and has an 458 // alignment requirement. If the SRSRC that we found is clobbering with 459 // the scratch wave offset, which may be in a fixed SGPR or a free SGPR 460 // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch 461 // wave offset to a free SGPR. 462 Register ScratchWaveOffsetReg; 463 if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) { 464 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF); 465 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); 466 AllSGPRs = AllSGPRs.slice( 467 std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded)); 468 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 469 for (MCPhysReg Reg : AllSGPRs) { 470 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 471 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) { 472 ScratchWaveOffsetReg = Reg; 473 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) 474 .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); 475 break; 476 } 477 } 478 } else { 479 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; 480 } 481 assert(ScratchWaveOffsetReg); 482 483 if (requiresStackPointerReference(MF)) { 484 Register SPReg = MFI->getStackPtrOffsetReg(); 485 assert(SPReg != AMDGPU::SP_REG); 486 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) 487 .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST)); 488 } 489 490 if (hasFP(MF)) { 491 Register FPReg = MFI->getFrameOffsetReg(); 492 assert(FPReg != AMDGPU::FP_REG); 493 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); 494 } 495 496 if ((MFI->hasFlatScratchInit() || ScratchRsrcReg) && 497 !ST.flatScratchIsArchitected()) { 498 MRI.addLiveIn(PreloadedScratchWaveOffsetReg); 499 MBB.addLiveIn(PreloadedScratchWaveOffsetReg); 500 } 501 502 if (MFI->hasFlatScratchInit()) { 503 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); 504 } 505 506 if (ScratchRsrcReg) { 507 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, 508 PreloadedScratchRsrcReg, 509 ScratchRsrcReg, ScratchWaveOffsetReg); 510 } 511 } 512 513 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` 514 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( 515 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 516 const DebugLoc &DL, Register PreloadedScratchRsrcReg, 517 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { 518 519 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 520 const SIInstrInfo *TII = ST.getInstrInfo(); 521 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 522 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 523 const Function &Fn = MF.getFunction(); 524 525 if (ST.isAmdPalOS()) { 526 // The pointer to the GIT is formed from the offset passed in and either 527 // the amdgpu-git-ptr-high function attribute or the top part of the PC 528 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 529 Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); 530 531 buildGitPtr(MBB, I, DL, TII, Rsrc01); 532 533 // We now have the GIT ptr - now get the scratch descriptor from the entry 534 // at offset 0 (or offset 16 for a compute shader). 535 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 536 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM); 537 auto MMO = MF.getMachineMemOperand(PtrInfo, 538 MachineMemOperand::MOLoad | 539 MachineMemOperand::MOInvariant | 540 MachineMemOperand::MODereferenceable, 541 16, Align(4)); 542 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; 543 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); 544 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); 545 BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) 546 .addReg(Rsrc01) 547 .addImm(EncodedOffset) // offset 548 .addImm(0) // cpol 549 .addReg(ScratchRsrcReg, RegState::ImplicitDefine) 550 .addMemOperand(MMO); 551 552 // The driver will always set the SRD for wave 64 (bits 118:117 of 553 // descriptor / bits 22:21 of third sub-reg will be 0b11) 554 // If the shader is actually wave32 we have to modify the const_index_stride 555 // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The 556 // reason the driver does this is that there can be cases where it presents 557 // 2 shaders with different wave size (e.g. VsFs). 558 // TODO: convert to using SCRATCH instructions or multiple SRD buffers 559 if (ST.isWave32()) { 560 const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32); 561 BuildMI(MBB, I, DL, SBitsetB32, Rsrc03) 562 .addImm(21) 563 .addReg(Rsrc03); 564 } 565 } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { 566 assert(!ST.isAmdHsaOrMesa(Fn)); 567 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 568 569 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); 570 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); 571 572 // Use relocations to get the pointer, and setup the other bits manually. 573 uint64_t Rsrc23 = TII->getScratchRsrcWords23(); 574 575 if (MFI->hasImplicitBufferPtr()) { 576 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 577 578 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { 579 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); 580 581 BuildMI(MBB, I, DL, Mov64, Rsrc01) 582 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 583 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 584 } else { 585 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); 586 587 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 588 auto MMO = MF.getMachineMemOperand( 589 PtrInfo, 590 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | 591 MachineMemOperand::MODereferenceable, 592 8, Align(4)); 593 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) 594 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 595 .addImm(0) // offset 596 .addImm(0) // cpol 597 .addMemOperand(MMO) 598 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 599 600 MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 601 MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 602 } 603 } else { 604 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 605 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 606 607 BuildMI(MBB, I, DL, SMovB32, Rsrc0) 608 .addExternalSymbol("SCRATCH_RSRC_DWORD0") 609 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 610 611 BuildMI(MBB, I, DL, SMovB32, Rsrc1) 612 .addExternalSymbol("SCRATCH_RSRC_DWORD1") 613 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 614 615 } 616 617 BuildMI(MBB, I, DL, SMovB32, Rsrc2) 618 .addImm(Rsrc23 & 0xffffffff) 619 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 620 621 BuildMI(MBB, I, DL, SMovB32, Rsrc3) 622 .addImm(Rsrc23 >> 32) 623 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 624 } else if (ST.isAmdHsaOrMesa(Fn)) { 625 assert(PreloadedScratchRsrcReg); 626 627 if (ScratchRsrcReg != PreloadedScratchRsrcReg) { 628 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) 629 .addReg(PreloadedScratchRsrcReg, RegState::Kill); 630 } 631 } 632 633 // Add the scratch wave offset into the scratch RSRC. 634 // 635 // We only want to update the first 48 bits, which is the base address 636 // pointer, without touching the adjacent 16 bits of flags. We know this add 637 // cannot carry-out from bit 47, otherwise the scratch allocation would be 638 // impossible to fit in the 48-bit global address space. 639 // 640 // TODO: Evaluate if it is better to just construct an SRD using the flat 641 // scratch init and some constants rather than update the one we are passed. 642 Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 643 Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 644 645 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in 646 // the kernel body via inreg arguments. 647 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0) 648 .addReg(ScratchRsrcSub0) 649 .addReg(ScratchWaveOffsetReg) 650 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 651 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1) 652 .addReg(ScratchRsrcSub1) 653 .addImm(0) 654 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 655 } 656 657 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { 658 switch (ID) { 659 case TargetStackID::Default: 660 case TargetStackID::NoAlloc: 661 case TargetStackID::SGPRSpill: 662 return true; 663 case TargetStackID::ScalableVector: 664 return false; 665 } 666 llvm_unreachable("Invalid TargetStackID::Value"); 667 } 668 669 static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI, 670 const SIMachineFunctionInfo *FuncInfo, 671 MachineFunction &MF, MachineBasicBlock &MBB, 672 MachineBasicBlock::iterator MBBI, bool IsProlog) { 673 if (LiveRegs.empty()) { 674 LiveRegs.init(TRI); 675 if (IsProlog) { 676 LiveRegs.addLiveIns(MBB); 677 } else { 678 // In epilog. 679 LiveRegs.addLiveOuts(MBB); 680 LiveRegs.stepBackward(*MBBI); 681 } 682 } 683 } 684 685 // Activate all lanes, returns saved exec. 686 static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, 687 MachineFunction &MF, 688 MachineBasicBlock &MBB, 689 MachineBasicBlock::iterator MBBI, 690 bool IsProlog) { 691 Register ScratchExecCopy; 692 MachineRegisterInfo &MRI = MF.getRegInfo(); 693 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 694 const SIInstrInfo *TII = ST.getInstrInfo(); 695 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 696 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 697 DebugLoc DL; 698 699 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); 700 701 ScratchExecCopy = findScratchNonCalleeSaveRegister( 702 MRI, LiveRegs, *TRI.getWaveMaskRegClass()); 703 if (!ScratchExecCopy) 704 report_fatal_error("failed to find free scratch register"); 705 706 LiveRegs.addReg(ScratchExecCopy); 707 708 const unsigned OrSaveExec = 709 ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; 710 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1); 711 712 return ScratchExecCopy; 713 } 714 715 // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. 716 // Otherwise we are spilling to memory. 717 static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) { 718 const MachineFrameInfo &MFI = MF.getFrameInfo(); 719 return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill; 720 } 721 722 void SIFrameLowering::emitPrologue(MachineFunction &MF, 723 MachineBasicBlock &MBB) const { 724 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 725 if (FuncInfo->isEntryFunction()) { 726 emitEntryFunctionPrologue(MF, MBB); 727 return; 728 } 729 730 const MachineFrameInfo &MFI = MF.getFrameInfo(); 731 MachineRegisterInfo &MRI = MF.getRegInfo(); 732 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 733 const SIInstrInfo *TII = ST.getInstrInfo(); 734 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 735 736 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 737 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 738 Register BasePtrReg = 739 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); 740 LivePhysRegs LiveRegs; 741 742 MachineBasicBlock::iterator MBBI = MBB.begin(); 743 DebugLoc DL; 744 745 bool HasFP = false; 746 bool HasBP = false; 747 uint32_t NumBytes = MFI.getStackSize(); 748 uint32_t RoundedSize = NumBytes; 749 // To avoid clobbering VGPRs in lanes that weren't active on function entry, 750 // turn on all lanes before doing the spill to memory. 751 Register ScratchExecCopy; 752 753 Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; 754 Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; 755 756 // VGPRs used for SGPR->VGPR spills 757 for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg : 758 FuncInfo->getSGPRSpillVGPRs()) { 759 if (!Reg.FI) 760 continue; 761 762 if (!ScratchExecCopy) 763 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, 764 /*IsProlog*/ true); 765 766 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR, 767 *Reg.FI); 768 } 769 770 // VGPRs used for Whole Wave Mode 771 for (const auto &Reg : FuncInfo->WWMReservedRegs) { 772 auto VGPR = Reg.first; 773 auto FI = Reg.second; 774 if (!FI) 775 continue; 776 777 if (!ScratchExecCopy) 778 ScratchExecCopy = 779 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true); 780 781 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI); 782 } 783 784 if (ScratchExecCopy) { 785 // FIXME: Split block and make terminator. 786 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 787 MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 788 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 789 .addReg(ScratchExecCopy, RegState::Kill); 790 LiveRegs.addReg(ScratchExecCopy); 791 } 792 793 if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) { 794 const int FramePtrFI = *FPSaveIndex; 795 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 796 797 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); 798 799 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 800 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 801 if (!TmpVGPR) 802 report_fatal_error("failed to find free scratch register"); 803 804 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 805 .addReg(FramePtrReg); 806 807 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 808 FramePtrFI); 809 } 810 811 if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) { 812 const int BasePtrFI = *BPSaveIndex; 813 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 814 815 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); 816 817 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 818 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 819 if (!TmpVGPR) 820 report_fatal_error("failed to find free scratch register"); 821 822 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 823 .addReg(BasePtrReg); 824 825 buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 826 BasePtrFI); 827 } 828 829 // In this case, spill the FP to a reserved VGPR. 830 if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) { 831 const int FramePtrFI = *FPSaveIndex; 832 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 833 834 assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); 835 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 836 FuncInfo->getSGPRToVGPRSpills(FramePtrFI); 837 assert(Spill.size() == 1); 838 839 // Save FP before setting it up. 840 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) 841 .addReg(FramePtrReg) 842 .addImm(Spill[0].Lane) 843 .addReg(Spill[0].VGPR, RegState::Undef); 844 } 845 846 // In this case, spill the BP to a reserved VGPR. 847 if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) { 848 const int BasePtrFI = *BPSaveIndex; 849 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 850 851 assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); 852 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 853 FuncInfo->getSGPRToVGPRSpills(BasePtrFI); 854 assert(Spill.size() == 1); 855 856 // Save BP before setting it up. 857 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) 858 .addReg(BasePtrReg) 859 .addImm(Spill[0].Lane) 860 .addReg(Spill[0].VGPR, RegState::Undef); 861 } 862 863 // Emit the copy if we need an FP, and are using a free SGPR to save it. 864 if (FuncInfo->SGPRForFPSaveRestoreCopy) { 865 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), 866 FuncInfo->SGPRForFPSaveRestoreCopy) 867 .addReg(FramePtrReg) 868 .setMIFlag(MachineInstr::FrameSetup); 869 } 870 871 // Emit the copy if we need a BP, and are using a free SGPR to save it. 872 if (FuncInfo->SGPRForBPSaveRestoreCopy) { 873 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), 874 FuncInfo->SGPRForBPSaveRestoreCopy) 875 .addReg(BasePtrReg) 876 .setMIFlag(MachineInstr::FrameSetup); 877 } 878 879 // If a copy has been emitted for FP and/or BP, Make the SGPRs 880 // used in the copy instructions live throughout the function. 881 SmallVector<MCPhysReg, 2> TempSGPRs; 882 if (FuncInfo->SGPRForFPSaveRestoreCopy) 883 TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy); 884 885 if (FuncInfo->SGPRForBPSaveRestoreCopy) 886 TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy); 887 888 if (!TempSGPRs.empty()) { 889 for (MachineBasicBlock &MBB : MF) { 890 for (MCPhysReg Reg : TempSGPRs) 891 MBB.addLiveIn(Reg); 892 893 MBB.sortUniqueLiveIns(); 894 } 895 if (!LiveRegs.empty()) { 896 LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); 897 LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy); 898 } 899 } 900 901 if (TRI.hasStackRealignment(MF)) { 902 HasFP = true; 903 const unsigned Alignment = MFI.getMaxAlign().value(); 904 905 RoundedSize += Alignment; 906 if (LiveRegs.empty()) { 907 LiveRegs.init(TRI); 908 LiveRegs.addLiveIns(MBB); 909 } 910 911 // s_add_u32 s33, s32, NumBytes 912 // s_and_b32 s33, s33, 0b111...0000 913 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), FramePtrReg) 914 .addReg(StackPtrReg) 915 .addImm((Alignment - 1) * getScratchScaleFactor(ST)) 916 .setMIFlag(MachineInstr::FrameSetup); 917 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) 918 .addReg(FramePtrReg, RegState::Kill) 919 .addImm(-Alignment * getScratchScaleFactor(ST)) 920 .setMIFlag(MachineInstr::FrameSetup); 921 FuncInfo->setIsStackRealigned(true); 922 } else if ((HasFP = hasFP(MF))) { 923 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 924 .addReg(StackPtrReg) 925 .setMIFlag(MachineInstr::FrameSetup); 926 } 927 928 // If we need a base pointer, set it up here. It's whatever the value of 929 // the stack pointer is at this point. Any variable size objects will be 930 // allocated after this, so we can still use the base pointer to reference 931 // the incoming arguments. 932 if ((HasBP = TRI.hasBasePointer(MF))) { 933 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 934 .addReg(StackPtrReg) 935 .setMIFlag(MachineInstr::FrameSetup); 936 } 937 938 if (HasFP && RoundedSize != 0) { 939 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) 940 .addReg(StackPtrReg) 941 .addImm(RoundedSize * getScratchScaleFactor(ST)) 942 .setMIFlag(MachineInstr::FrameSetup); 943 } 944 945 assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy || 946 FuncInfo->FramePointerSaveIndex)) && 947 "Needed to save FP but didn't save it anywhere"); 948 949 assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy && 950 !FuncInfo->FramePointerSaveIndex)) && 951 "Saved FP but didn't need it"); 952 953 assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy || 954 FuncInfo->BasePointerSaveIndex)) && 955 "Needed to save BP but didn't save it anywhere"); 956 957 assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy && 958 !FuncInfo->BasePointerSaveIndex)) && 959 "Saved BP but didn't need it"); 960 } 961 962 void SIFrameLowering::emitEpilogue(MachineFunction &MF, 963 MachineBasicBlock &MBB) const { 964 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 965 if (FuncInfo->isEntryFunction()) 966 return; 967 968 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 969 const SIInstrInfo *TII = ST.getInstrInfo(); 970 MachineRegisterInfo &MRI = MF.getRegInfo(); 971 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 972 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); 973 LivePhysRegs LiveRegs; 974 DebugLoc DL; 975 976 const MachineFrameInfo &MFI = MF.getFrameInfo(); 977 uint32_t NumBytes = MFI.getStackSize(); 978 uint32_t RoundedSize = FuncInfo->isStackRealigned() 979 ? NumBytes + MFI.getMaxAlign().value() 980 : NumBytes; 981 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 982 const Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 983 const Register BasePtrReg = 984 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); 985 986 Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; 987 Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; 988 989 if (RoundedSize != 0 && hasFP(MF)) { 990 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) 991 .addReg(StackPtrReg) 992 .addImm(RoundedSize * getScratchScaleFactor(ST)) 993 .setMIFlag(MachineInstr::FrameDestroy); 994 } 995 996 if (FuncInfo->SGPRForFPSaveRestoreCopy) { 997 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 998 .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) 999 .setMIFlag(MachineInstr::FrameDestroy); 1000 } 1001 1002 if (FuncInfo->SGPRForBPSaveRestoreCopy) { 1003 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 1004 .addReg(FuncInfo->SGPRForBPSaveRestoreCopy) 1005 .setMIFlag(MachineInstr::FrameDestroy); 1006 } 1007 1008 if (FPSaveIndex) { 1009 const int FramePtrFI = *FPSaveIndex; 1010 assert(!MFI.isDeadObjectIndex(FramePtrFI)); 1011 if (spilledToMemory(MF, FramePtrFI)) { 1012 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); 1013 1014 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 1015 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 1016 if (!TmpVGPR) 1017 report_fatal_error("failed to find free scratch register"); 1018 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 1019 FramePtrFI); 1020 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) 1021 .addReg(TmpVGPR, RegState::Kill); 1022 } else { 1023 // Reload from VGPR spill. 1024 assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); 1025 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 1026 FuncInfo->getSGPRToVGPRSpills(FramePtrFI); 1027 assert(Spill.size() == 1); 1028 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg) 1029 .addReg(Spill[0].VGPR) 1030 .addImm(Spill[0].Lane); 1031 } 1032 } 1033 1034 if (BPSaveIndex) { 1035 const int BasePtrFI = *BPSaveIndex; 1036 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 1037 if (spilledToMemory(MF, BasePtrFI)) { 1038 initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); 1039 1040 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 1041 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 1042 if (!TmpVGPR) 1043 report_fatal_error("failed to find free scratch register"); 1044 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, 1045 BasePtrFI); 1046 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) 1047 .addReg(TmpVGPR, RegState::Kill); 1048 } else { 1049 // Reload from VGPR spill. 1050 assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); 1051 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 1052 FuncInfo->getSGPRToVGPRSpills(BasePtrFI); 1053 assert(Spill.size() == 1); 1054 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), BasePtrReg) 1055 .addReg(Spill[0].VGPR) 1056 .addImm(Spill[0].Lane); 1057 } 1058 } 1059 1060 Register ScratchExecCopy; 1061 for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg : 1062 FuncInfo->getSGPRSpillVGPRs()) { 1063 if (!Reg.FI) 1064 continue; 1065 1066 if (!ScratchExecCopy) 1067 ScratchExecCopy = 1068 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); 1069 1070 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR, 1071 *Reg.FI); 1072 } 1073 1074 for (const auto &Reg : FuncInfo->WWMReservedRegs) { 1075 auto VGPR = Reg.first; 1076 auto FI = Reg.second; 1077 if (!FI) 1078 continue; 1079 1080 if (!ScratchExecCopy) 1081 ScratchExecCopy = 1082 buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); 1083 1084 buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI); 1085 } 1086 1087 if (ScratchExecCopy) { 1088 // FIXME: Split block and make terminator. 1089 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1090 MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1091 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 1092 .addReg(ScratchExecCopy, RegState::Kill); 1093 } 1094 } 1095 1096 #ifndef NDEBUG 1097 static bool allSGPRSpillsAreDead(const MachineFunction &MF) { 1098 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1099 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1100 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 1101 I != E; ++I) { 1102 if (!MFI.isDeadObjectIndex(I) && 1103 MFI.getStackID(I) == TargetStackID::SGPRSpill && 1104 (I != FuncInfo->FramePointerSaveIndex && 1105 I != FuncInfo->BasePointerSaveIndex)) { 1106 return false; 1107 } 1108 } 1109 1110 return true; 1111 } 1112 #endif 1113 1114 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, 1115 int FI, 1116 Register &FrameReg) const { 1117 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); 1118 1119 FrameReg = RI->getFrameRegister(MF); 1120 return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI)); 1121 } 1122 1123 void SIFrameLowering::processFunctionBeforeFrameFinalized( 1124 MachineFunction &MF, 1125 RegScavenger *RS) const { 1126 MachineFrameInfo &MFI = MF.getFrameInfo(); 1127 1128 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1129 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1130 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1131 1132 FuncInfo->removeDeadFrameIndices(MFI); 1133 assert(allSGPRSpillsAreDead(MF) && 1134 "SGPR spill should have been removed in SILowerSGPRSpills"); 1135 1136 // FIXME: The other checks should be redundant with allStackObjectsAreDead, 1137 // but currently hasNonSpillStackObjects is set only from source 1138 // allocas. Stack temps produced from legalization are not counted currently. 1139 if (!allStackObjectsAreDead(MFI)) { 1140 assert(RS && "RegScavenger required if spilling"); 1141 1142 // Add an emergency spill slot 1143 RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI)); 1144 } 1145 } 1146 1147 // Only report VGPRs to generic code. 1148 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, 1149 BitVector &SavedVGPRs, 1150 RegScavenger *RS) const { 1151 TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); 1152 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1153 if (MFI->isEntryFunction()) 1154 return; 1155 1156 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 1157 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1158 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1159 1160 // Ignore the SGPRs the default implementation found. 1161 SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask()); 1162 1163 // Do not save AGPRs prior to GFX90A because there was no easy way to do so. 1164 // In gfx908 there was do AGPR loads and stores and thus spilling also 1165 // require a temporary VGPR. 1166 if (!ST.hasGFX90AInsts()) 1167 SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); 1168 1169 // hasFP only knows about stack objects that already exist. We're now 1170 // determining the stack slots that will be created, so we have to predict 1171 // them. Stack objects force FP usage with calls. 1172 // 1173 // Note a new VGPR CSR may be introduced if one is used for the spill, but we 1174 // don't want to report it here. 1175 // 1176 // FIXME: Is this really hasReservedCallFrame? 1177 const bool WillHaveFP = 1178 FrameInfo.hasCalls() && 1179 (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); 1180 1181 // VGPRs used for SGPR spilling need to be specially inserted in the prolog, 1182 // so don't allow the default insertion to handle them. 1183 for (auto SSpill : MFI->getSGPRSpillVGPRs()) 1184 SavedVGPRs.reset(SSpill.VGPR); 1185 1186 LivePhysRegs LiveRegs; 1187 LiveRegs.init(*TRI); 1188 1189 if (WillHaveFP || hasFP(MF)) { 1190 assert(!MFI->SGPRForFPSaveRestoreCopy && !MFI->FramePointerSaveIndex && 1191 "Re-reserving spill slot for FP"); 1192 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy, 1193 MFI->FramePointerSaveIndex, true); 1194 } 1195 1196 if (TRI->hasBasePointer(MF)) { 1197 if (MFI->SGPRForFPSaveRestoreCopy) 1198 LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy); 1199 1200 assert(!MFI->SGPRForBPSaveRestoreCopy && 1201 !MFI->BasePointerSaveIndex && "Re-reserving spill slot for BP"); 1202 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy, 1203 MFI->BasePointerSaveIndex, false); 1204 } 1205 } 1206 1207 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, 1208 BitVector &SavedRegs, 1209 RegScavenger *RS) const { 1210 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); 1211 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1212 if (MFI->isEntryFunction()) 1213 return; 1214 1215 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1216 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1217 1218 // The SP is specifically managed and we don't want extra spills of it. 1219 SavedRegs.reset(MFI->getStackPtrOffsetReg()); 1220 1221 const BitVector AllSavedRegs = SavedRegs; 1222 SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask()); 1223 1224 // If clearing VGPRs changed the mask, we will have some CSR VGPR spills. 1225 const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs; 1226 1227 // We have to anticipate introducing CSR VGPR spills if we don't have any 1228 // stack objects already, since we require an FP if there is a call and stack. 1229 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 1230 const bool WillHaveFP = FrameInfo.hasCalls() && HaveAnyCSRVGPR; 1231 1232 // FP will be specially managed like SP. 1233 if (WillHaveFP || hasFP(MF)) 1234 SavedRegs.reset(MFI->getFrameOffsetReg()); 1235 } 1236 1237 bool SIFrameLowering::assignCalleeSavedSpillSlots( 1238 MachineFunction &MF, const TargetRegisterInfo *TRI, 1239 std::vector<CalleeSavedInfo> &CSI) const { 1240 if (CSI.empty()) 1241 return true; // Early exit if no callee saved registers are modified! 1242 1243 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1244 if (!FuncInfo->SGPRForFPSaveRestoreCopy && 1245 !FuncInfo->SGPRForBPSaveRestoreCopy) 1246 return false; 1247 1248 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1249 const SIRegisterInfo *RI = ST.getRegisterInfo(); 1250 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1251 Register BasePtrReg = RI->getBaseRegister(); 1252 unsigned NumModifiedRegs = 0; 1253 1254 if (FuncInfo->SGPRForFPSaveRestoreCopy) 1255 NumModifiedRegs++; 1256 if (FuncInfo->SGPRForBPSaveRestoreCopy) 1257 NumModifiedRegs++; 1258 1259 for (auto &CS : CSI) { 1260 if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) { 1261 CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); 1262 if (--NumModifiedRegs) 1263 break; 1264 } else if (CS.getReg() == BasePtrReg && 1265 FuncInfo->SGPRForBPSaveRestoreCopy) { 1266 CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy); 1267 if (--NumModifiedRegs) 1268 break; 1269 } 1270 } 1271 1272 return false; 1273 } 1274 1275 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( 1276 MachineFunction &MF, 1277 MachineBasicBlock &MBB, 1278 MachineBasicBlock::iterator I) const { 1279 int64_t Amount = I->getOperand(0).getImm(); 1280 if (Amount == 0) 1281 return MBB.erase(I); 1282 1283 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1284 const SIInstrInfo *TII = ST.getInstrInfo(); 1285 const DebugLoc &DL = I->getDebugLoc(); 1286 unsigned Opc = I->getOpcode(); 1287 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); 1288 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; 1289 1290 if (!hasReservedCallFrame(MF)) { 1291 Amount = alignTo(Amount, getStackAlign()); 1292 assert(isUInt<32>(Amount) && "exceeded stack address space size"); 1293 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1294 Register SPReg = MFI->getStackPtrOffsetReg(); 1295 1296 unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 1297 BuildMI(MBB, I, DL, TII->get(Op), SPReg) 1298 .addReg(SPReg) 1299 .addImm(Amount * getScratchScaleFactor(ST)); 1300 } else if (CalleePopAmount != 0) { 1301 llvm_unreachable("is this used?"); 1302 } 1303 1304 return MBB.erase(I); 1305 } 1306 1307 /// Returns true if the frame will require a reference to the stack pointer. 1308 /// 1309 /// This is the set of conditions common to setting up the stack pointer in a 1310 /// kernel, and for using a frame pointer in a callable function. 1311 /// 1312 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm 1313 /// references SP. 1314 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) { 1315 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint(); 1316 } 1317 1318 // The FP for kernels is always known 0, so we never really need to setup an 1319 // explicit register for it. However, DisableFramePointerElim will force us to 1320 // use a register for it. 1321 bool SIFrameLowering::hasFP(const MachineFunction &MF) const { 1322 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1323 1324 // For entry functions we can use an immediate offset in most cases, so the 1325 // presence of calls doesn't imply we need a distinct frame pointer. 1326 if (MFI.hasCalls() && 1327 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { 1328 // All offsets are unsigned, so need to be addressed in the same direction 1329 // as stack growth. 1330 1331 // FIXME: This function is pretty broken, since it can be called before the 1332 // frame layout is determined or CSR spills are inserted. 1333 return MFI.getStackSize() != 0; 1334 } 1335 1336 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || 1337 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( 1338 MF) || 1339 MF.getTarget().Options.DisableFramePointerElim(MF); 1340 } 1341 1342 // This is essentially a reduced version of hasFP for entry functions. Since the 1343 // stack pointer is known 0 on entry to kernels, we never really need an FP 1344 // register. We may need to initialize the stack pointer depending on the frame 1345 // properties, which logically overlaps many of the cases where an ordinary 1346 // function would require an FP. 1347 bool SIFrameLowering::requiresStackPointerReference( 1348 const MachineFunction &MF) const { 1349 // Callable functions always require a stack pointer reference. 1350 assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && 1351 "only expected to call this for entry points"); 1352 1353 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1354 1355 // Entry points ordinarily don't need to initialize SP. We have to set it up 1356 // for callees if there are any. Also note tail calls are impossible/don't 1357 // make any sense for kernels. 1358 if (MFI.hasCalls()) 1359 return true; 1360 1361 // We still need to initialize the SP if we're doing anything weird that 1362 // references the SP, like variable sized stack objects. 1363 return frameTriviallyRequiresSP(MFI); 1364 } 1365