//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"


static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                         const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                                       const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack
// pointer, but we would then have to make sure that we were in fact saving at
// least one callee-save register in the prologue, which is additional
// complexity that doesn't seem worth the benefit.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  if (Unused) {
    // We are looking for a register that can be used throughout the entire
    // function, so any use is unacceptable.
    for (MCRegister Reg : RC) {
      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
        return Reg;
    }
  } else {
    for (MCRegister Reg : RC) {
      if (LiveRegs.available(MRI, Reg))
        return Reg;
    }
  }

  // If an unused register is required, the caller can tolerate failure and has
  // an alternative plan. In all other contexts this must succeed.
  if (!Unused)
    report_fatal_error("failed to find free scratch register");

  return MCRegister();
}

static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
                                           LivePhysRegs &LiveRegs,
                                           Register &TempSGPR,
                                           Optional<int> &FrameIndex,
                                           bool IsFP) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

#ifndef NDEBUG
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
#endif

  // We need to save and restore the current FP/BP.

  // 1: If there is already a VGPR with free lanes, use it. We may already have
  // to pay the penalty for spilling a CSR VGPR.
  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
    int NewFI = FrameInfo.CreateStackObject(4, 4, true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");

    FrameIndex = NewFI;

    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
               dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                      << '\n');
    return;
  }

  // 2: Next, try to save the FP/BP in an unused SGPR.
  TempSGPR = findScratchNonCalleeSaveRegister(
      MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);

  if (!TempSGPR) {
    int NewFI = FrameInfo.CreateStackObject(4, 4, true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP/BP,
      // so we're forced to spill another VGPR to use for the spill.
      FrameIndex = NewFI;
    } else {
      // 4: If all else fails, spill the FP/BP to memory.
      FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
    }

    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
               dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                      << '\n';);
  } else {
    LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
                      << printReg(TempSGPR, TRI) << '\n');
  }
}

// We need to emit the stack operations here specially because the frame
// register used here differs from the one the rest of the function (i.e.,
// getFrameRegister) would use.
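// Both buildPrologSpill and buildEpilogReload below choose between the two
// scratch buffer addressing forms: offsets that fit in the 12-bit immediate
// use the *_OFFSET opcodes directly, while larger offsets are first
// materialized into a scratch VGPR (found with
// findScratchNonCalleeSaveRegister) and accessed with the *_OFFEN opcodes.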
144 static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, 145 MachineBasicBlock::iterator I, 146 const SIInstrInfo *TII, Register SpillReg, 147 Register ScratchRsrcReg, Register SPReg, int FI) { 148 MachineFunction *MF = MBB.getParent(); 149 MachineFrameInfo &MFI = MF->getFrameInfo(); 150 151 int64_t Offset = MFI.getObjectOffset(FI); 152 153 MachineMemOperand *MMO = MF->getMachineMemOperand( 154 MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, 155 MFI.getObjectAlign(FI)); 156 157 if (isUInt<12>(Offset)) { 158 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET)) 159 .addReg(SpillReg, RegState::Kill) 160 .addReg(ScratchRsrcReg) 161 .addReg(SPReg) 162 .addImm(Offset) 163 .addImm(0) // glc 164 .addImm(0) // slc 165 .addImm(0) // tfe 166 .addImm(0) // dlc 167 .addImm(0) // swz 168 .addMemOperand(MMO); 169 return; 170 } 171 172 MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( 173 MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); 174 175 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) 176 .addImm(Offset); 177 178 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) 179 .addReg(SpillReg, RegState::Kill) 180 .addReg(OffsetReg, RegState::Kill) 181 .addReg(ScratchRsrcReg) 182 .addReg(SPReg) 183 .addImm(0) 184 .addImm(0) // glc 185 .addImm(0) // slc 186 .addImm(0) // tfe 187 .addImm(0) // dlc 188 .addImm(0) // swz 189 .addMemOperand(MMO); 190 } 191 192 static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, 193 MachineBasicBlock::iterator I, 194 const SIInstrInfo *TII, Register SpillReg, 195 Register ScratchRsrcReg, Register SPReg, int FI) { 196 MachineFunction *MF = MBB.getParent(); 197 MachineFrameInfo &MFI = MF->getFrameInfo(); 198 int64_t Offset = MFI.getObjectOffset(FI); 199 200 MachineMemOperand *MMO = MF->getMachineMemOperand( 201 MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4, 202 MFI.getObjectAlign(FI)); 203 204 if (isUInt<12>(Offset)) { 205 BuildMI(MBB, I, DebugLoc(), 206 TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg) 207 .addReg(ScratchRsrcReg) 208 .addReg(SPReg) 209 .addImm(Offset) 210 .addImm(0) // glc 211 .addImm(0) // slc 212 .addImm(0) // tfe 213 .addImm(0) // dlc 214 .addImm(0) // swz 215 .addMemOperand(MMO); 216 return; 217 } 218 219 MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( 220 MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); 221 222 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) 223 .addImm(Offset); 224 225 BuildMI(MBB, I, DebugLoc(), 226 TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg) 227 .addReg(OffsetReg, RegState::Kill) 228 .addReg(ScratchRsrcReg) 229 .addReg(SPReg) 230 .addImm(0) 231 .addImm(0) // glc 232 .addImm(0) // slc 233 .addImm(0) // tfe 234 .addImm(0) // dlc 235 .addImm(0) // swz 236 .addMemOperand(MMO); 237 } 238 239 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` 240 void SIFrameLowering::emitEntryFunctionFlatScratchInit( 241 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 242 const DebugLoc &DL, Register ScratchWaveOffsetReg) const { 243 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 244 const SIInstrInfo *TII = ST.getInstrInfo(); 245 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 246 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 247 248 // We don't need this if we only have spills since there is no user facing 249 // scratch. 
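// Summary of the paths below: when flat scratch holds a pointer, the preloaded
// FLAT_SCRATCH_INIT pair plus the wave's scratch offset is written into
// FLAT_SCRATCH (via s_setreg on GFX10 and later); on older targets the scratch
// size and the offset in 256-byte units are written into FLAT_SCR_LO/HI
// separately.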
250 251 // TODO: If we know we don't have flat instructions earlier, we can omit 252 // this from the input registers. 253 // 254 // TODO: We only need to know if we access scratch space through a flat 255 // pointer. Because we only detect if flat instructions are used at all, 256 // this will be used more often than necessary on VI. 257 258 Register FlatScratchInitReg = 259 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); 260 261 MachineRegisterInfo &MRI = MF.getRegInfo(); 262 MRI.addLiveIn(FlatScratchInitReg); 263 MBB.addLiveIn(FlatScratchInitReg); 264 265 Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); 266 Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); 267 268 // Do a 64-bit pointer add. 269 if (ST.flatScratchIsPointer()) { 270 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 271 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) 272 .addReg(FlatScrInitLo) 273 .addReg(ScratchWaveOffsetReg); 274 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi) 275 .addReg(FlatScrInitHi) 276 .addImm(0); 277 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 278 addReg(FlatScrInitLo). 279 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO | 280 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 281 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). 282 addReg(FlatScrInitHi). 283 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | 284 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); 285 return; 286 } 287 288 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) 289 .addReg(FlatScrInitLo) 290 .addReg(ScratchWaveOffsetReg); 291 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI) 292 .addReg(FlatScrInitHi) 293 .addImm(0); 294 295 return; 296 } 297 298 assert(ST.getGeneration() < AMDGPUSubtarget::GFX10); 299 300 // Copy the size in bytes. 301 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) 302 .addReg(FlatScrInitHi, RegState::Kill); 303 304 // Add wave offset in bytes to private base offset. 305 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. 306 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) 307 .addReg(FlatScrInitLo) 308 .addReg(ScratchWaveOffsetReg); 309 310 // Convert offset to 256-byte units. 311 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI) 312 .addReg(FlatScrInitLo, RegState::Kill) 313 .addImm(8); 314 } 315 316 // Shift down registers reserved for the scratch RSRC. 317 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( 318 MachineFunction &MF) const { 319 320 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 321 const SIInstrInfo *TII = ST.getInstrInfo(); 322 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 323 MachineRegisterInfo &MRI = MF.getRegInfo(); 324 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 325 326 assert(MFI->isEntryFunction()); 327 328 Register ScratchRsrcReg = MFI->getScratchRSrcReg(); 329 330 if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg)) 331 return Register(); 332 333 if (ST.hasSGPRInitBug() || 334 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF)) 335 return ScratchRsrcReg; 336 337 // We reserved the last registers for this. Shift it down to the end of those 338 // which were actually used. 339 // 340 // FIXME: It might be safer to use a pseudoregister before replacement. 341 342 // FIXME: We should be able to eliminate unused input registers. We only 343 // cannot do this for the resources required for scratch access. 
For now we 344 // skip over user SGPRs and may leave unused holes. 345 346 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; 347 ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF); 348 AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); 349 350 // Skip the last N reserved elements because they should have already been 351 // reserved for VCC etc. 352 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 353 for (MCPhysReg Reg : AllSGPR128s) { 354 // Pick the first unallocated one. Make sure we don't clobber the other 355 // reserved input we needed. Also for PAL, make sure we don't clobber 356 // the GIT pointer passed in SGPR0 or SGPR8. 357 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 358 !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { 359 MRI.replaceRegWith(ScratchRsrcReg, Reg); 360 MFI->setScratchRSrcReg(Reg); 361 return Reg; 362 } 363 } 364 365 return ScratchRsrcReg; 366 } 367 368 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, 369 MachineBasicBlock &MBB) const { 370 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); 371 372 // FIXME: If we only have SGPR spills, we won't actually be using scratch 373 // memory since these spill to VGPRs. We should be cleaning up these unused 374 // SGPR spill frame indices somewhere. 375 376 // FIXME: We still have implicit uses on SGPR spill instructions in case they 377 // need to spill to vector memory. It's likely that will not happen, but at 378 // this point it appears we need the setup. This part of the prolog should be 379 // emitted after frame indices are eliminated. 380 381 // FIXME: Remove all of the isPhysRegUsed checks 382 383 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 384 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 385 const SIInstrInfo *TII = ST.getInstrInfo(); 386 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 387 MachineRegisterInfo &MRI = MF.getRegInfo(); 388 const Function &F = MF.getFunction(); 389 390 assert(MFI->isEntryFunction()); 391 392 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( 393 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); 394 // FIXME: Hack to not crash in situations which emitted an error. 395 if (!PreloadedScratchWaveOffsetReg) 396 return; 397 398 // We need to do the replacement of the private segment buffer register even 399 // if there are no stack objects. There could be stores to undef or a 400 // constant without an associated object. 401 // 402 // This will return `Register()` in cases where there are no actual 403 // uses of the SRSRC. 404 Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); 405 406 // Make the selected register live throughout the function. 407 if (ScratchRsrcReg) { 408 for (MachineBasicBlock &OtherBB : MF) { 409 if (&OtherBB != &MBB) { 410 OtherBB.addLiveIn(ScratchRsrcReg); 411 } 412 } 413 } 414 415 // Now that we have fixed the reserved SRSRC we need to locate the 416 // (potentially) preloaded SRSRC. 417 Register PreloadedScratchRsrcReg; 418 if (ST.isAmdHsaOrMesa(F)) { 419 PreloadedScratchRsrcReg = 420 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); 421 if (ScratchRsrcReg && PreloadedScratchRsrcReg) { 422 // We added live-ins during argument lowering, but since they were not 423 // used they were deleted. We're adding the uses now, so add them back. 
424 MRI.addLiveIn(PreloadedScratchRsrcReg); 425 MBB.addLiveIn(PreloadedScratchRsrcReg); 426 } 427 } 428 429 // Debug location must be unknown since the first debug location is used to 430 // determine the end of the prologue. 431 DebugLoc DL; 432 MachineBasicBlock::iterator I = MBB.begin(); 433 434 // We found the SRSRC first because it needs four registers and has an 435 // alignment requirement. If the SRSRC that we found is clobbering with 436 // the scratch wave offset, which may be in a fixed SGPR or a free SGPR 437 // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch 438 // wave offset to a free SGPR. 439 Register ScratchWaveOffsetReg; 440 if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) { 441 ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF); 442 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); 443 AllSGPRs = AllSGPRs.slice( 444 std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded)); 445 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); 446 for (MCPhysReg Reg : AllSGPRs) { 447 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && 448 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) { 449 ScratchWaveOffsetReg = Reg; 450 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) 451 .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); 452 break; 453 } 454 } 455 } else { 456 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; 457 } 458 assert(ScratchWaveOffsetReg); 459 460 if (MF.getFrameInfo().hasCalls()) { 461 Register SPReg = MFI->getStackPtrOffsetReg(); 462 assert(SPReg != AMDGPU::SP_REG); 463 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) 464 .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize()); 465 } 466 467 if (hasFP(MF)) { 468 Register FPReg = MFI->getFrameOffsetReg(); 469 assert(FPReg != AMDGPU::FP_REG); 470 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); 471 } 472 473 if (MFI->hasFlatScratchInit() || ScratchRsrcReg) { 474 MRI.addLiveIn(PreloadedScratchWaveOffsetReg); 475 MBB.addLiveIn(PreloadedScratchWaveOffsetReg); 476 } 477 478 if (MFI->hasFlatScratchInit()) { 479 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); 480 } 481 482 if (ScratchRsrcReg) { 483 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, 484 PreloadedScratchRsrcReg, 485 ScratchRsrcReg, ScratchWaveOffsetReg); 486 } 487 } 488 489 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` 490 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( 491 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 492 const DebugLoc &DL, Register PreloadedScratchRsrcReg, 493 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { 494 495 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 496 const SIInstrInfo *TII = ST.getInstrInfo(); 497 const SIRegisterInfo *TRI = &TII->getRegisterInfo(); 498 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 499 const Function &Fn = MF.getFunction(); 500 501 if (ST.isAmdPalOS()) { 502 // The pointer to the GIT is formed from the offset passed in and either 503 // the amdgpu-git-ptr-high function attribute or the top part of the PC 504 Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 505 Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 506 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 507 508 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 509 510 if (MFI->getGITPtrHigh() != 0xffffffff) { 511 
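// The amdgpu-git-ptr-high attribute pinned the high half of the GIT address,
// so it can be materialized with a plain s_mov_b32; the else branch below
// falls back to s_getpc_b64 and takes the high half from the PC instead.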
BuildMI(MBB, I, DL, SMovB32, RsrcHi) 512 .addImm(MFI->getGITPtrHigh()) 513 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 514 } else { 515 const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); 516 BuildMI(MBB, I, DL, GetPC64, Rsrc01); 517 } 518 Register GitPtrLo = MFI->getGITPtrLoReg(MF); 519 MF.getRegInfo().addLiveIn(GitPtrLo); 520 MBB.addLiveIn(GitPtrLo); 521 BuildMI(MBB, I, DL, SMovB32, RsrcLo) 522 .addReg(GitPtrLo) 523 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 524 525 // We now have the GIT ptr - now get the scratch descriptor from the entry 526 // at offset 0 (or offset 16 for a compute shader). 527 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 528 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM); 529 auto MMO = MF.getMachineMemOperand(PtrInfo, 530 MachineMemOperand::MOLoad | 531 MachineMemOperand::MOInvariant | 532 MachineMemOperand::MODereferenceable, 533 16, Align(4)); 534 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; 535 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); 536 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); 537 BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) 538 .addReg(Rsrc01) 539 .addImm(EncodedOffset) // offset 540 .addImm(0) // glc 541 .addImm(0) // dlc 542 .addReg(ScratchRsrcReg, RegState::ImplicitDefine) 543 .addMemOperand(MMO); 544 } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { 545 assert(!ST.isAmdHsaOrMesa(Fn)); 546 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); 547 548 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); 549 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); 550 551 // Use relocations to get the pointer, and setup the other bits manually. 
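// getScratchRsrcWords23() supplies the constant upper half of the scratch
// resource descriptor (dwords 2-3: roughly, the size and format/flag fields);
// only the 64-bit base pointer in dwords 0-1 has to be produced at runtime
// below.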
552 uint64_t Rsrc23 = TII->getScratchRsrcWords23(); 553 554 if (MFI->hasImplicitBufferPtr()) { 555 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); 556 557 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { 558 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); 559 560 BuildMI(MBB, I, DL, Mov64, Rsrc01) 561 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 562 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 563 } else { 564 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); 565 566 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 567 auto MMO = MF.getMachineMemOperand( 568 PtrInfo, 569 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | 570 MachineMemOperand::MODereferenceable, 571 8, Align(4)); 572 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) 573 .addReg(MFI->getImplicitBufferPtrUserSGPR()) 574 .addImm(0) // offset 575 .addImm(0) // glc 576 .addImm(0) // dlc 577 .addMemOperand(MMO) 578 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 579 580 MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 581 MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); 582 } 583 } else { 584 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 585 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 586 587 BuildMI(MBB, I, DL, SMovB32, Rsrc0) 588 .addExternalSymbol("SCRATCH_RSRC_DWORD0") 589 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 590 591 BuildMI(MBB, I, DL, SMovB32, Rsrc1) 592 .addExternalSymbol("SCRATCH_RSRC_DWORD1") 593 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 594 595 } 596 597 BuildMI(MBB, I, DL, SMovB32, Rsrc2) 598 .addImm(Rsrc23 & 0xffffffff) 599 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 600 601 BuildMI(MBB, I, DL, SMovB32, Rsrc3) 602 .addImm(Rsrc23 >> 32) 603 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 604 } else if (ST.isAmdHsaOrMesa(Fn)) { 605 assert(PreloadedScratchRsrcReg); 606 607 if (ScratchRsrcReg != PreloadedScratchRsrcReg) { 608 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) 609 .addReg(PreloadedScratchRsrcReg, RegState::Kill); 610 } 611 } 612 613 // Add the scratch wave offset into the scratch RSRC. 614 // 615 // We only want to update the first 48 bits, which is the base address 616 // pointer, without touching the adjacent 16 bits of flags. We know this add 617 // cannot carry-out from bit 47, otherwise the scratch allocation would be 618 // impossible to fit in the 48-bit global address space. 619 // 620 // TODO: Evaluate if it is better to just construct an SRD using the flat 621 // scratch init and some constants rather than update the one we are passed. 622 Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); 623 Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); 624 625 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in 626 // the kernel body via inreg arguments. 
627 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0) 628 .addReg(ScratchRsrcSub0) 629 .addReg(ScratchWaveOffsetReg) 630 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 631 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1) 632 .addReg(ScratchRsrcSub1) 633 .addImm(0) 634 .addReg(ScratchRsrcReg, RegState::ImplicitDefine); 635 } 636 637 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { 638 switch (ID) { 639 case TargetStackID::Default: 640 case TargetStackID::NoAlloc: 641 case TargetStackID::SGPRSpill: 642 return true; 643 case TargetStackID::SVEVector: 644 return false; 645 } 646 llvm_unreachable("Invalid TargetStackID::Value"); 647 } 648 649 // Activate all lanes, returns saved exec. 650 static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, 651 MachineFunction &MF, 652 MachineBasicBlock &MBB, 653 MachineBasicBlock::iterator MBBI, 654 bool IsProlog) { 655 Register ScratchExecCopy; 656 MachineRegisterInfo &MRI = MF.getRegInfo(); 657 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 658 const SIInstrInfo *TII = ST.getInstrInfo(); 659 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 660 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 661 DebugLoc DL; 662 663 if (LiveRegs.empty()) { 664 if (IsProlog) { 665 LiveRegs.init(TRI); 666 LiveRegs.addLiveIns(MBB); 667 if (FuncInfo->SGPRForFPSaveRestoreCopy) 668 LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy); 669 670 if (FuncInfo->SGPRForBPSaveRestoreCopy) 671 LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy); 672 } else { 673 // In epilog. 674 LiveRegs.init(*ST.getRegisterInfo()); 675 LiveRegs.addLiveOuts(MBB); 676 LiveRegs.stepBackward(*MBBI); 677 } 678 } 679 680 ScratchExecCopy = findScratchNonCalleeSaveRegister( 681 MRI, LiveRegs, *TRI.getWaveMaskRegClass()); 682 683 if (!IsProlog) 684 LiveRegs.removeReg(ScratchExecCopy); 685 686 const unsigned OrSaveExec = 687 ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; 688 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1); 689 690 return ScratchExecCopy; 691 } 692 693 void SIFrameLowering::emitPrologue(MachineFunction &MF, 694 MachineBasicBlock &MBB) const { 695 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 696 if (FuncInfo->isEntryFunction()) { 697 emitEntryFunctionPrologue(MF, MBB); 698 return; 699 } 700 701 const MachineFrameInfo &MFI = MF.getFrameInfo(); 702 MachineRegisterInfo &MRI = MF.getRegInfo(); 703 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 704 const SIInstrInfo *TII = ST.getInstrInfo(); 705 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 706 707 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 708 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 709 Register BasePtrReg = 710 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); 711 LivePhysRegs LiveRegs; 712 713 MachineBasicBlock::iterator MBBI = MBB.begin(); 714 DebugLoc DL; 715 716 bool HasFP = false; 717 bool HasBP = false; 718 uint32_t NumBytes = MFI.getStackSize(); 719 uint32_t RoundedSize = NumBytes; 720 // To avoid clobbering VGPRs in lanes that weren't active on function entry, 721 // turn on all lanes before doing the spill to memory. 722 Register ScratchExecCopy; 723 724 bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); 725 bool SpillFPToMemory = false; 726 // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. 727 // Otherwise we are spilling the FP to memory. 
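// Taken together, determineCalleeSaves may have picked one of three save
// strategies for the FP/BP: a copy into a free SGPR (SGPRForFPSaveRestoreCopy
// / SGPRForBPSaveRestoreCopy), a spill to a VGPR lane (an SGPRSpill stack ID
// on the save index), or a spill to scratch memory. Each case is handled by a
// separate block below.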
728 if (HasFPSaveIndex) { 729 SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != 730 TargetStackID::SGPRSpill; 731 } 732 733 bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); 734 bool SpillBPToMemory = false; 735 // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. 736 // Otherwise we are spilling the BP to memory. 737 if (HasBPSaveIndex) { 738 SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != 739 TargetStackID::SGPRSpill; 740 } 741 742 // Emit the copy if we need an FP, and are using a free SGPR to save it. 743 if (FuncInfo->SGPRForFPSaveRestoreCopy) { 744 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy) 745 .addReg(FramePtrReg) 746 .setMIFlag(MachineInstr::FrameSetup); 747 } 748 749 // Emit the copy if we need a BP, and are using a free SGPR to save it. 750 if (FuncInfo->SGPRForBPSaveRestoreCopy) { 751 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), 752 FuncInfo->SGPRForBPSaveRestoreCopy) 753 .addReg(BasePtrReg) 754 .setMIFlag(MachineInstr::FrameSetup); 755 } 756 757 // If a copy has been emitted for FP and/or BP, Make the SGPRs 758 // used in the copy instructions live throughout the function. 759 SmallVector<MCPhysReg, 2> TempSGPRs; 760 if (FuncInfo->SGPRForFPSaveRestoreCopy) 761 TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy); 762 763 if (FuncInfo->SGPRForBPSaveRestoreCopy) 764 TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy); 765 766 if (!TempSGPRs.empty()) { 767 for (MachineBasicBlock &MBB : MF) { 768 for (MCPhysReg Reg : TempSGPRs) 769 MBB.addLiveIn(Reg); 770 771 MBB.sortUniqueLiveIns(); 772 } 773 } 774 775 for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg 776 : FuncInfo->getSGPRSpillVGPRs()) { 777 if (!Reg.FI.hasValue()) 778 continue; 779 780 if (!ScratchExecCopy) 781 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); 782 783 buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR, 784 FuncInfo->getScratchRSrcReg(), 785 StackPtrReg, 786 Reg.FI.getValue()); 787 } 788 789 if (HasFPSaveIndex && SpillFPToMemory) { 790 assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue())); 791 792 if (!ScratchExecCopy) 793 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); 794 795 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 796 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 797 798 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 799 .addReg(FramePtrReg); 800 801 buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR, 802 FuncInfo->getScratchRSrcReg(), StackPtrReg, 803 FuncInfo->FramePointerSaveIndex.getValue()); 804 } 805 806 if (HasBPSaveIndex && SpillBPToMemory) { 807 assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex)); 808 809 if (!ScratchExecCopy) 810 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); 811 812 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( 813 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 814 815 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 816 .addReg(BasePtrReg); 817 818 buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR, 819 FuncInfo->getScratchRSrcReg(), StackPtrReg, 820 *FuncInfo->BasePointerSaveIndex); 821 } 822 823 if (ScratchExecCopy) { 824 // FIXME: Split block and make terminator. 825 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 826 MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; 827 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 828 .addReg(ScratchExecCopy, RegState::Kill); 829 LiveRegs.addReg(ScratchExecCopy); 830 } 831 832 // In this case, spill the FP to a reserved VGPR. 833 if (HasFPSaveIndex && !SpillFPToMemory) { 834 const int FI = FuncInfo->FramePointerSaveIndex.getValue(); 835 assert(!MFI.isDeadObjectIndex(FI)); 836 837 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); 838 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 839 FuncInfo->getSGPRToVGPRSpills(FI); 840 assert(Spill.size() == 1); 841 842 // Save FP before setting it up. 843 // FIXME: This should respect spillSGPRToVGPR; 844 BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), 845 Spill[0].VGPR) 846 .addReg(FramePtrReg) 847 .addImm(Spill[0].Lane) 848 .addReg(Spill[0].VGPR, RegState::Undef); 849 } 850 851 // In this case, spill the BP to a reserved VGPR. 852 if (HasBPSaveIndex && !SpillBPToMemory) { 853 const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; 854 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 855 856 assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); 857 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 858 FuncInfo->getSGPRToVGPRSpills(BasePtrFI); 859 assert(Spill.size() == 1); 860 861 // Save BP before setting it up. 862 // FIXME: This should respect spillSGPRToVGPR; 863 BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), 864 Spill[0].VGPR) 865 .addReg(BasePtrReg) 866 .addImm(Spill[0].Lane) 867 .addReg(Spill[0].VGPR, RegState::Undef); 868 } 869 870 if (TRI.needsStackRealignment(MF)) { 871 HasFP = true; 872 const unsigned Alignment = MFI.getMaxAlign().value(); 873 874 RoundedSize += Alignment; 875 if (LiveRegs.empty()) { 876 LiveRegs.init(TRI); 877 LiveRegs.addLiveIns(MBB); 878 LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); 879 LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy); 880 } 881 882 Register ScratchSPReg = findScratchNonCalleeSaveRegister( 883 MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass); 884 assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy && 885 ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy); 886 887 // s_add_u32 tmp_reg, s32, NumBytes 888 // s_and_b32 s32, tmp_reg, 0b111...0000 889 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg) 890 .addReg(StackPtrReg) 891 .addImm((Alignment - 1) * ST.getWavefrontSize()) 892 .setMIFlag(MachineInstr::FrameSetup); 893 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) 894 .addReg(ScratchSPReg, RegState::Kill) 895 .addImm(-Alignment * ST.getWavefrontSize()) 896 .setMIFlag(MachineInstr::FrameSetup); 897 FuncInfo->setIsStackRealigned(true); 898 } else if ((HasFP = hasFP(MF))) { 899 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 900 .addReg(StackPtrReg) 901 .setMIFlag(MachineInstr::FrameSetup); 902 } 903 904 // If we need a base pointer, set it up here. It's whatever the value of 905 // the stack pointer is at this point. Any variable size objects will be 906 // allocated after this, so we can still use the base pointer to reference 907 // the incoming arguments. 
908 if ((HasBP = TRI.hasBasePointer(MF))) { 909 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 910 .addReg(StackPtrReg) 911 .setMIFlag(MachineInstr::FrameSetup); 912 } 913 914 if (HasFP && RoundedSize != 0) { 915 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) 916 .addReg(StackPtrReg) 917 .addImm(RoundedSize * ST.getWavefrontSize()) 918 .setMIFlag(MachineInstr::FrameSetup); 919 } 920 921 assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy || 922 FuncInfo->FramePointerSaveIndex)) && 923 "Needed to save FP but didn't save it anywhere"); 924 925 assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy && 926 !FuncInfo->FramePointerSaveIndex)) && 927 "Saved FP but didn't need it"); 928 929 assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy || 930 FuncInfo->BasePointerSaveIndex)) && 931 "Needed to save BP but didn't save it anywhere"); 932 933 assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy && 934 !FuncInfo->BasePointerSaveIndex)) && 935 "Saved BP but didn't need it"); 936 } 937 938 void SIFrameLowering::emitEpilogue(MachineFunction &MF, 939 MachineBasicBlock &MBB) const { 940 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 941 if (FuncInfo->isEntryFunction()) 942 return; 943 944 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 945 const SIInstrInfo *TII = ST.getInstrInfo(); 946 MachineRegisterInfo &MRI = MF.getRegInfo(); 947 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 948 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); 949 LivePhysRegs LiveRegs; 950 DebugLoc DL; 951 952 const MachineFrameInfo &MFI = MF.getFrameInfo(); 953 uint32_t NumBytes = MFI.getStackSize(); 954 uint32_t RoundedSize = FuncInfo->isStackRealigned() 955 ? NumBytes + MFI.getMaxAlign().value() 956 : NumBytes; 957 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); 958 const Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 959 const Register BasePtrReg = 960 TRI.hasBasePointer(MF) ? 
TRI.getBaseRegister() : Register(); 961 962 bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); 963 bool SpillFPToMemory = false; 964 if (HasFPSaveIndex) { 965 SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != 966 TargetStackID::SGPRSpill; 967 } 968 969 bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); 970 bool SpillBPToMemory = false; 971 if (HasBPSaveIndex) { 972 SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != 973 TargetStackID::SGPRSpill; 974 } 975 976 if (RoundedSize != 0 && hasFP(MF)) { 977 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) 978 .addReg(StackPtrReg) 979 .addImm(RoundedSize * ST.getWavefrontSize()) 980 .setMIFlag(MachineInstr::FrameDestroy); 981 } 982 983 if (FuncInfo->SGPRForFPSaveRestoreCopy) { 984 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) 985 .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) 986 .setMIFlag(MachineInstr::FrameSetup); 987 } 988 989 if (FuncInfo->SGPRForBPSaveRestoreCopy) { 990 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) 991 .addReg(FuncInfo->SGPRForBPSaveRestoreCopy) 992 .setMIFlag(MachineInstr::FrameSetup); 993 } 994 995 Register ScratchExecCopy; 996 if (HasFPSaveIndex) { 997 const int FI = FuncInfo->FramePointerSaveIndex.getValue(); 998 assert(!MFI.isDeadObjectIndex(FI)); 999 if (SpillFPToMemory) { 1000 if (!ScratchExecCopy) 1001 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); 1002 1003 MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( 1004 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 1005 buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR, 1006 FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); 1007 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) 1008 .addReg(TempVGPR, RegState::Kill); 1009 } else { 1010 // Reload from VGPR spill. 1011 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); 1012 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 1013 FuncInfo->getSGPRToVGPRSpills(FI); 1014 assert(Spill.size() == 1); 1015 BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), 1016 FramePtrReg) 1017 .addReg(Spill[0].VGPR) 1018 .addImm(Spill[0].Lane); 1019 } 1020 } 1021 1022 if (HasBPSaveIndex) { 1023 const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; 1024 assert(!MFI.isDeadObjectIndex(BasePtrFI)); 1025 if (SpillBPToMemory) { 1026 if (!ScratchExecCopy) 1027 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); 1028 1029 MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( 1030 MRI, LiveRegs, AMDGPU::VGPR_32RegClass); 1031 buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR, 1032 FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI); 1033 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) 1034 .addReg(TempVGPR, RegState::Kill); 1035 } else { 1036 // Reload from VGPR spill. 
1037 assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); 1038 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = 1039 FuncInfo->getSGPRToVGPRSpills(BasePtrFI); 1040 assert(Spill.size() == 1); 1041 BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), 1042 BasePtrReg) 1043 .addReg(Spill[0].VGPR) 1044 .addImm(Spill[0].Lane); 1045 } 1046 } 1047 1048 for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg : 1049 FuncInfo->getSGPRSpillVGPRs()) { 1050 if (!Reg.FI.hasValue()) 1051 continue; 1052 1053 if (!ScratchExecCopy) 1054 ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); 1055 1056 buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR, 1057 FuncInfo->getScratchRSrcReg(), StackPtrReg, 1058 Reg.FI.getValue()); 1059 } 1060 1061 if (ScratchExecCopy) { 1062 // FIXME: Split block and make terminator. 1063 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1064 MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1065 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) 1066 .addReg(ScratchExecCopy, RegState::Kill); 1067 } 1068 } 1069 1070 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not 1071 // memory. They should have been removed by now. 1072 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { 1073 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 1074 I != E; ++I) { 1075 if (!MFI.isDeadObjectIndex(I)) 1076 return false; 1077 } 1078 1079 return true; 1080 } 1081 1082 #ifndef NDEBUG 1083 static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI, 1084 Optional<int> FramePointerSaveIndex, 1085 Optional<int> BasePointerSaveIndex) { 1086 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); 1087 I != E; ++I) { 1088 if (!MFI.isDeadObjectIndex(I) && 1089 MFI.getStackID(I) == TargetStackID::SGPRSpill && 1090 ((FramePointerSaveIndex && I != FramePointerSaveIndex) || 1091 (BasePointerSaveIndex && I != BasePointerSaveIndex))) { 1092 return false; 1093 } 1094 } 1095 1096 return true; 1097 } 1098 #endif 1099 1100 int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, 1101 Register &FrameReg) const { 1102 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); 1103 1104 FrameReg = RI->getFrameRegister(MF); 1105 return MF.getFrameInfo().getObjectOffset(FI); 1106 } 1107 1108 void SIFrameLowering::processFunctionBeforeFrameFinalized( 1109 MachineFunction &MF, 1110 RegScavenger *RS) const { 1111 MachineFrameInfo &MFI = MF.getFrameInfo(); 1112 1113 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1114 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1115 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1116 1117 FuncInfo->removeDeadFrameIndices(MFI); 1118 assert(allSGPRSpillsAreDead(MFI, None, None) && 1119 "SGPR spill should have been removed in SILowerSGPRSpills"); 1120 1121 // FIXME: The other checks should be redundant with allStackObjectsAreDead, 1122 // but currently hasNonSpillStackObjects is set only from source 1123 // allocas. Stack temps produced from legalization are not counted currently. 
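// If any real stack objects remain, reserve an emergency slot below so the
// register scavenger has somewhere to spill when frame index elimination needs
// a temporary register; entry functions use a fixed object at offset 0 for
// this.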
1124 if (!allStackObjectsAreDead(MFI)) { 1125 assert(RS && "RegScavenger required if spilling"); 1126 1127 if (FuncInfo->isEntryFunction()) { 1128 int ScavengeFI = MFI.CreateFixedObject( 1129 TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); 1130 RS->addScavengingFrameIndex(ScavengeFI); 1131 } else { 1132 int ScavengeFI = MFI.CreateStackObject( 1133 TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 1134 TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass), 1135 false); 1136 RS->addScavengingFrameIndex(ScavengeFI); 1137 } 1138 } 1139 } 1140 1141 // Only report VGPRs to generic code. 1142 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, 1143 BitVector &SavedVGPRs, 1144 RegScavenger *RS) const { 1145 TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); 1146 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1147 if (MFI->isEntryFunction()) 1148 return; 1149 1150 MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 1151 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1152 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1153 1154 // Ignore the SGPRs the default implementation found. 1155 SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask()); 1156 1157 // hasFP only knows about stack objects that already exist. We're now 1158 // determining the stack slots that will be created, so we have to predict 1159 // them. Stack objects force FP usage with calls. 1160 // 1161 // Note a new VGPR CSR may be introduced if one is used for the spill, but we 1162 // don't want to report it here. 1163 // 1164 // FIXME: Is this really hasReservedCallFrame? 1165 const bool WillHaveFP = 1166 FrameInfo.hasCalls() && 1167 (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); 1168 1169 // VGPRs used for SGPR spilling need to be specially inserted in the prolog, 1170 // so don't allow the default insertion to handle them. 1171 for (auto SSpill : MFI->getSGPRSpillVGPRs()) 1172 SavedVGPRs.reset(SSpill.VGPR); 1173 1174 LivePhysRegs LiveRegs; 1175 LiveRegs.init(*TRI); 1176 1177 if (WillHaveFP || hasFP(MF)) { 1178 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy, 1179 MFI->FramePointerSaveIndex, true); 1180 } 1181 1182 if (TRI->hasBasePointer(MF)) { 1183 if (MFI->SGPRForFPSaveRestoreCopy) 1184 LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy); 1185 getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy, 1186 MFI->BasePointerSaveIndex, false); 1187 } 1188 } 1189 1190 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, 1191 BitVector &SavedRegs, 1192 RegScavenger *RS) const { 1193 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); 1194 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1195 if (MFI->isEntryFunction()) 1196 return; 1197 1198 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1199 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1200 1201 // The SP is specifically managed and we don't want extra spills of it. 1202 SavedRegs.reset(MFI->getStackPtrOffsetReg()); 1203 SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask()); 1204 } 1205 1206 bool SIFrameLowering::assignCalleeSavedSpillSlots( 1207 MachineFunction &MF, const TargetRegisterInfo *TRI, 1208 std::vector<CalleeSavedInfo> &CSI) const { 1209 if (CSI.empty()) 1210 return true; // Early exit if no callee saved registers are modified! 
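// If an SGPR copy was chosen for the FP and/or BP, redirect those CSR entries
// to the copy register below; returning false leaves spill slot assignment for
// the remaining CSRs to the generic code.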
1211 1212 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 1213 if (!FuncInfo->SGPRForFPSaveRestoreCopy && 1214 !FuncInfo->SGPRForBPSaveRestoreCopy) 1215 return false; 1216 1217 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1218 const SIRegisterInfo *RI = ST.getRegisterInfo(); 1219 Register FramePtrReg = FuncInfo->getFrameOffsetReg(); 1220 Register BasePtrReg = RI->getBaseRegister(); 1221 unsigned NumModifiedRegs = 0; 1222 1223 if (FuncInfo->SGPRForFPSaveRestoreCopy) 1224 NumModifiedRegs++; 1225 if (FuncInfo->SGPRForBPSaveRestoreCopy) 1226 NumModifiedRegs++; 1227 1228 for (auto &CS : CSI) { 1229 if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) { 1230 CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); 1231 if (--NumModifiedRegs) 1232 break; 1233 } else if (CS.getReg() == BasePtrReg && 1234 FuncInfo->SGPRForBPSaveRestoreCopy) { 1235 CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy); 1236 if (--NumModifiedRegs) 1237 break; 1238 } 1239 } 1240 1241 return false; 1242 } 1243 1244 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( 1245 MachineFunction &MF, 1246 MachineBasicBlock &MBB, 1247 MachineBasicBlock::iterator I) const { 1248 int64_t Amount = I->getOperand(0).getImm(); 1249 if (Amount == 0) 1250 return MBB.erase(I); 1251 1252 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1253 const SIInstrInfo *TII = ST.getInstrInfo(); 1254 const DebugLoc &DL = I->getDebugLoc(); 1255 unsigned Opc = I->getOpcode(); 1256 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); 1257 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; 1258 1259 if (!hasReservedCallFrame(MF)) { 1260 Amount = alignTo(Amount, getStackAlign()); 1261 assert(isUInt<32>(Amount) && "exceeded stack address space size"); 1262 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1263 Register SPReg = MFI->getStackPtrOffsetReg(); 1264 1265 unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 1266 BuildMI(MBB, I, DL, TII->get(Op), SPReg) 1267 .addReg(SPReg) 1268 .addImm(Amount * ST.getWavefrontSize()); 1269 } else if (CalleePopAmount != 0) { 1270 llvm_unreachable("is this used?"); 1271 } 1272 1273 return MBB.erase(I); 1274 } 1275 1276 bool SIFrameLowering::hasFP(const MachineFunction &MF) const { 1277 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1278 1279 // For entry functions we can use an immediate offset in most cases, so the 1280 // presence of calls doesn't imply we need a distinct frame pointer. 1281 if (MFI.hasCalls() && 1282 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { 1283 // All offsets are unsigned, so need to be addressed in the same direction 1284 // as stack growth. 1285 1286 // FIXME: This function is pretty broken, since it can be called before the 1287 // frame layout is determined or CSR spills are inserted. 1288 return MFI.getStackSize() != 0; 1289 } 1290 1291 return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || 1292 MFI.hasStackMap() || MFI.hasPatchPoint() || 1293 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) || 1294 MF.getTarget().Options.DisableFramePointerElim(MF); 1295 } 1296