//===----------------------- SIFrameLowering.cpp --------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "AMDGPUSubtarget.h"

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;


static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST,
                                         const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST,
                                       const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
                      ST.getMaxNumSGPRs(MF));
}

void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
                                          const SIRegisterInfo* TRI,
                                          MachineFunction &MF,
                                          MachineBasicBlock &MBB) const {
  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  unsigned FlatScratchInitReg
    = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  // Copy the size in bytes.
  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
  const SISubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {

  // We need to insert initialization of the scratch resource descriptor.
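  //
  // The strategy below: the register info reserves the descriptor at the top
  // of the SGPR_128 tuples. If that default reservation is still what is in
  // use and the subtarget does not have the SGPR init bug, shift the
  // descriptor down to the first free tuple just past the preloaded input
  // SGPRs so the high registers stay available.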
  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  if (ScratchRsrcReg == AMDGPU::NoRegister)
    return AMDGPU::NoRegister;

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudo register before replacement.

  // FIXME: We should be able to eliminate unused input registers; the only
  // ones we cannot eliminate are the resources required for scratch access.
  // For now we skip over user SGPRs and may leave unused holes.

  // We find the resource first because it has an alignment requirement.

  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(
    std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Consider the SGPR_128 tuples that come after the preloaded input SGPRs.
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
  const SISubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ST.hasSGPRInitBug() ||
      ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
    return ScratchWaveOffsetReg;

  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
  if (NumPreloaded > AllSGPRs.size())
    return ScratchWaveOffsetReg;

  AllSGPRs = AllSGPRs.slice(NumPreloaded);

  // We need to drop registers from the end of the list that we cannot use
  // for the scratch wave offset.
  // + 2 because s102 and s103 do not exist on VI.
  // + 2 for vcc
  // + 2 for xnack_mask
  // + 2 for flat_scratch
  // + 4 for registers reserved for the scratch resource register
  // + 1 for the register reserved for the scratch wave offset. (By excluding
  //     this register from the list to consider, it means that when this
  //     register is being used for the scratch wave offset and there are no
  //     other free SGPRs, then the value will stay in this register.)
  // ----
  //  13
  if (AllSGPRs.size() < 13)
    return ScratchWaveOffsetReg;

  for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
    // scratch descriptor, since we haven't added its uses yet.
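    // Candidates are the SGPRs past the preloaded inputs, minus the 13
    // dropped from the back (illustrative numbers only: with 104 addressable
    // SGPRs and 6 preloaded, that is s6 through s90). A register is skipped
    // below if it is already used, is not allocatable, or is one of the four
    // SGPRs making up the scratch resource descriptor chosen above.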
    if (!MRI.isPhysRegUsed(Reg)) {
      if (!MRI.isAllocatable(Reg) ||
          TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
        continue;

      MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
      MFI->setScratchWaveOffsetReg(Reg);
      return Reg;
    }
  }

  return ScratchWaveOffsetReg;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
  // specified.
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  if (ST.debuggerEmitPrologue())
    emitDebuggerPrologue(MF, MBB);

  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If we only have SGPR spills, we won't actually be using scratch memory
  // since these spill to VGPRs.
  //
  // FIXME: We should be cleaning up these unused SGPR spill frame indices
  // somewhere.

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned ScratchRsrcReg
    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
  unsigned ScratchWaveOffsetReg
    = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);

  if (ScratchRsrcReg == AMDGPU::NoRegister) {
    assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
    return;
  }

  assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));

  // We need to do the replacement of the private segment buffer and wave
  // offset register even if there are no stack objects. There could be stores
  // to undef or to a constant without an associated object.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prologue should
  // be emitted after frame indices are eliminated.

  if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
    emitFlatScratchInit(TII, TRI, MF, MBB);

  // We need to insert initialization of the scratch resource descriptor.
  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
  if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
  }

  bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
  bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);

  // We added live-ins during argument lowering, but since they were not used
  // they were deleted. We're adding the uses now, so add them back.
  if (OffsetRegUsed) {
    assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
           "scratch wave offset input is required");
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
    assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
    MRI.addLiveIn(PreloadedPrivateBufferReg);
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

  // Make the selected registers live throughout the function.
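  // They are written only in the entry block prologue but may be read by
  // scratch accesses in any block, so every other block needs them in its
  // live-in list for physical register liveness to remain correct after this
  // pass.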
  for (MachineBasicBlock &OtherBB : MF) {
    if (&OtherBB == &MBB)
      continue;

    if (OffsetRegUsed)
      OtherBB.addLiveIn(ScratchWaveOffsetReg);

    if (ResourceRegUsed)
      OtherBB.addLiveIn(ScratchRsrcReg);
  }

  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // If we reserved the original input registers, we don't need to copy to the
  // reserved registers.

  bool CopyBuffer = ResourceRegUsed &&
    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
    ST.isAmdCodeObjectV2(MF) &&
    ScratchRsrcReg != PreloadedPrivateBufferReg;

  // We need to be careful about the copy ordering here to avoid overwriting
  // one of the input registers before it has been copied to its final
  // destination. Usually the offset should be copied first.
  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
                                              ScratchWaveOffsetReg);
  if (CopyBuffer && CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (OffsetRegUsed &&
      PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
      .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
  }

  if (CopyBuffer && !CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (ResourceRegUsed &&
      (ST.isMesaGfxShader(MF) ||
       (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) {
    assert(!ST.isAmdCodeObjectV2(MF));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and set up the other bits manually.
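    //
    // Layout of the descriptor as built here: sub0/sub1 hold the scratch
    // base address words (from the SCRATCH_RSRC_DWORD0/1 relocation symbols,
    // or from the private memory input pointer when one is preloaded), while
    // sub2/sub3 are filled with the constant words returned by
    // getScratchRsrcWords23().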
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasPrivateMemoryInputPtr()) {
      unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(PreloadedPrivateBufferReg)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        PointerType *PtrTy =
          PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
                           AMDGPUAS::CONSTANT_ADDRESS);
        MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
        auto MMO = MF.getMachineMemOperand(PtrInfo,
                                           MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                           0, 0);
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(PreloadedPrivateBufferReg)
          .addImm(0) // offset
          .addImm(0) // glc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      }
    } else {
      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  }
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {

}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (!MFI.hasStackObjects())
    return;

  bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();

  assert((RS || !MayNeedScavengingEmergencySlot) &&
         "RegScavenger required if spilling");

  if (MayNeedScavengingEmergencySlot) {
    int ScavengeFI = MFI.CreateStackObject(
      AMDGPU::SGPR_32RegClass.getSize(),
      AMDGPU::SGPR_32RegClass.getAlignment(), false);
    RS->addScavengingFrameIndex(ScavengeFI);
  }
}

void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MachineBasicBlock::iterator I = MBB.begin();
  DebugLoc DL;

  // For each dimension:
  for (unsigned i = 0; i < 3; ++i) {
    // Get work group ID SGPR, and make it live-in again.
    unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
    MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
    MBB.addLiveIn(WorkGroupIDSGPR);

    // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
    // order to spill it to scratch.
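    // A throwaway virtual VGPR is enough here: it only carries the SGPR value
    // into the scratch store emitted by storeRegToStackSlot below.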
    unsigned WorkGroupIDVGPR =
      MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
      .addReg(WorkGroupIDSGPR);

    // Spill work group ID.
    int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
                             WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);

    // Get work item ID VGPR, and make it live-in again.
    unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
    MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
    MBB.addLiveIn(WorkItemIDVGPR);

    // Spill work item ID.
    int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
                             WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
  }
}