//===----------------------- SIFrameLowering.cpp -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "AMDGPUSubtarget.h"

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST,
                                         const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST,
                                       const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
                      ST.getMaxNumSGPRs(MF));
}

void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
                                          MachineFunction &MF,
                                          MachineBasicBlock &MBB) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();

  // We don't need this if we only have spills, since there is no user-facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  // The debug location must be unknown since the first debug location is used
  // to determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  unsigned FlatScratchInitReg
    = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add the wave offset in bytes to the private base offset.
  // See the comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert the offset to 256-byte units.
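  // (A logical shift right by 8 divides the byte offset by 256.)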
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
  const SISubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {

  // We need to insert initialization of the scratch resource descriptor.
  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  if (ScratchRsrcReg == AMDGPU::NoRegister)
    return AMDGPU::NoRegister;

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. The only
  // ones we cannot eliminate are the resources required for scratch access.
  // For now we skip over user SGPRs and may leave unused holes.

  // We find the resource first because it has an alignment requirement.

  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(
    std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last 2 elements because the last one is reserved for VCC, and
  // this is the 2nd to last element already.
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
  const SISubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ST.hasSGPRInitBug() ||
      ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
    return ScratchWaveOffsetReg;

  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
  if (NumPreloaded > AllSGPRs.size())
    return ScratchWaveOffsetReg;

  AllSGPRs = AllSGPRs.slice(NumPreloaded);

  // We need to drop registers from the end of the list that we cannot use
  // for the scratch wave offset:
  // + 2 because s102 and s103 do not exist on VI
  // + 2 for vcc
  // + 2 for xnack_mask
  // + 2 for flat_scratch
  // + 4 for the registers reserved for the scratch resource register
  // + 1 for the register reserved for the scratch wave offset. (By excluding
  //     this register from the list to consider, when this register is being
  //     used for the scratch wave offset and there are no other free SGPRs,
  //     the value will stay in this register.)
  // ----
  //  13
  if (AllSGPRs.size() < 13)
    return ScratchWaveOffsetReg;

  for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
    // scratch descriptor, since we haven't added its uses yet.
    if (!MRI.isPhysRegUsed(Reg)) {
      if (!MRI.isAllocatable(Reg) ||
          TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
        continue;

      MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
      MFI->setScratchWaveOffsetReg(Reg);
      return Reg;
    }
  }

  return ScratchWaveOffsetReg;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  // Emit the debugger prologue if the "amdgpu-debugger-emit-prologue"
  // attribute was specified.
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  if (ST.debuggerEmitPrologue())
    emitDebuggerPrologue(MF, MBB);

  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If we only have SGPR spills, we won't actually be using scratch memory
  // since these spill to VGPRs.
  //
  // FIXME: We should be cleaning up these unused SGPR spill frame indices
  // somewhere.

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned ScratchRsrcReg
    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
  unsigned ScratchWaveOffsetReg
    = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);

  if (ScratchRsrcReg == AMDGPU::NoRegister) {
    assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
    return;
  }

  assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));

  // We need to do the replacement of the private segment buffer and wave
  // offset register even if there are no stack objects. There could be stores
  // to undef or to a constant address without an associated object.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that this will not happen, but
  // at this point it appears we need the setup. This part of the prolog should
  // be emitted after frame indices are eliminated.

  if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
    emitFlatScratchInit(ST, MF, MBB);

  // We need to insert initialization of the scratch resource descriptor.
  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
  if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
  }

  bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
  bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);

  // We added live-ins during argument lowering, but since they were not used
  // they were deleted. We're adding the uses now, so add them back.
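  // (Physical registers read in the entry block must be marked live-in on
  // both the function and the block, or the machine verifier will reject the
  // reads as uses of undefined registers.)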
  if (OffsetRegUsed) {
    assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
           "scratch wave offset input is required");
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
    assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
    MRI.addLiveIn(PreloadedPrivateBufferReg);
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

  // Make the selected registers live throughout the function.
  for (MachineBasicBlock &OtherBB : MF) {
    if (&OtherBB == &MBB)
      continue;

    if (OffsetRegUsed)
      OtherBB.addLiveIn(ScratchWaveOffsetReg);

    if (ResourceRegUsed)
      OtherBB.addLiveIn(ScratchRsrcReg);
  }

  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // If we reserved the original input registers, we don't need to copy to the
  // reserved registers.

  bool CopyBuffer = ResourceRegUsed &&
    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
    ST.isAmdCodeObjectV2(MF) &&
    ScratchRsrcReg != PreloadedPrivateBufferReg;

  // We need to be careful about the copy ordering here to avoid overwriting
  // one of the input registers before it has been copied to its final
  // destination. Usually the offset should be copied first.
  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
                                              ScratchWaveOffsetReg);
  if (CopyBuffer && CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (OffsetRegUsed &&
      PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
      .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
  }

  if (CopyBuffer && !CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (ResourceRegUsed &&
      (ST.isMesaGfxShader(MF) ||
       PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
    assert(!ST.isAmdCodeObjectV2(MF));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and set up the other bits manually.
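    // (Words 2-3 of the descriptor are a constant bit pattern supplied by the
    // subtarget via getScratchRsrcWords23(); only the base address in words
    // 0-1 has to be materialized dynamically.)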
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasPrivateMemoryInputPtr()) {
      unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(PreloadedPrivateBufferReg)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        PointerType *PtrTy =
          PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
                           AMDGPUAS::CONSTANT_ADDRESS);
        MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
        auto MMO = MF.getMachineMemOperand(PtrInfo,
                                           MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                           0, 0);
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(PreloadedPrivateBufferReg)
          .addImm(0) // offset
          .addImm(0) // glc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      }
    } else {
      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  }
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {

}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (!MFI.hasStackObjects())
    return;

  // Any function with stack objects may need an emergency scavenging slot
  // while frame indices are eliminated.
  assert(RS && "RegScavenger required if spilling");

  int ScavengeFI = MFI.CreateStackObject(
    AMDGPU::SGPR_32RegClass.getSize(),
    AMDGPU::SGPR_32RegClass.getAlignment(), false);
  RS->addScavengingFrameIndex(ScavengeFI);
}

void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MachineBasicBlock::iterator I = MBB.begin();
  DebugLoc DL;

  // For each dimension:
  for (unsigned i = 0; i < 3; ++i) {
    // Get the work group ID SGPR, and make it live-in again.
    unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
    MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
    MBB.addLiveIn(WorkGroupIDSGPR);

    // Since SGPRs are spilled into VGPRs, copy the work group ID SGPR to a
    // VGPR in order to spill it to scratch.
    unsigned WorkGroupIDVGPR =
      MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
      .addReg(WorkGroupIDSGPR);

    // Spill the work group ID.
    int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
                             WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass,
                             TRI);

    // Get the work item ID VGPR, and make it live-in again.
    unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
    MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
    MBB.addLiveIn(WorkItemIDVGPR);

    // Spill the work item ID.
    int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
                             WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass,
                             TRI);
  }
}