//===----------------------- SIFrameLowering.cpp --------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "AMDGPUSubtarget.h"

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
                                         const SIRegisterInfo *TRI) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      TRI->getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF,
                                       const SIRegisterInfo *TRI) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
                      TRI->getMaxNumSGPRs(MF));
}

void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
                                          const SIRegisterInfo *TRI,
                                          MachineFunction &MF,
                                          MachineBasicBlock &MBB) const {
  // We don't need this if we only have spills, since there is no user-facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  // The debug location must be unknown since the first debug location is used
  // to determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  unsigned FlatScratchInitReg
    = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  // Copy the size in bytes.
  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // Add the wave offset in bytes to the private base offset.
  // See the comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert the offset to 256-byte units (a right shift by 8 divides by 256).
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
  const SISubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {

  // We need to insert initialization of the scratch resource descriptor.
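  // The descriptor occupies an aligned quadruple of SGPRs (sub0-sub3 of an
  // SGPR_128 tuple), which is the alignment requirement mentioned below and
  // the reason the search walks SGPR_128 candidates rather than single SGPRs.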
  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  if (ScratchRsrcReg == AMDGPU::NoRegister)
    return AMDGPU::NoRegister;

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We cannot do
  // this only for the resources required for scratch access. For now we skip
  // over user SGPRs and may leave unused holes.

  // We find the resource first because it has an alignment requirement.

  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
  AllSGPR128s = AllSGPR128s.slice(
    std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
  const SISubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ST.hasSGPRInitBug() ||
      ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
    return ScratchWaveOffsetReg;

  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
  if (NumPreloaded > AllSGPRs.size())
    return ScratchWaveOffsetReg;

  AllSGPRs = AllSGPRs.slice(NumPreloaded);

  // We need to drop registers from the end of the list that we cannot use
  // for the scratch wave offset.
  // + 2 because s102 and s103 do not exist on VI
  // + 2 for vcc
  // + 2 for xnack_mask
  // + 2 for flat_scratch
  // + 4 for the registers reserved for the scratch resource register
  // + 1 for the register reserved for the scratch wave offset. (By excluding
  //     this register from the candidate list, if it is already being used
  //     for the scratch wave offset and there are no other free SGPRs, the
  //     value simply stays in this register.)
  // ----
  //  13
  if (AllSGPRs.size() < 13)
    return ScratchWaveOffsetReg;

  for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
    // scratch descriptor, since we haven't added its uses yet.
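    // (MRI.isPhysRegUsed() cannot see those pending uses, so the explicit
    // isSubRegisterEq() check below is what keeps the wave offset from
    // landing inside the descriptor tuple.)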
    if (!MRI.isPhysRegUsed(Reg)) {
      if (!MRI.isAllocatable(Reg) ||
          TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
        continue;

      MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
      MFI->setScratchWaveOffsetReg(Reg);
      return Reg;
    }
  }

  return ScratchWaveOffsetReg;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  // Emit the debugger prologue if the "amdgpu-debugger-emit-prologue"
  // attribute was specified.
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  if (ST.debuggerEmitPrologue())
    emitDebuggerPrologue(MF, MBB);

  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If we only have SGPR spills, we won't actually be using scratch memory
  // since these spill to VGPRs.
  //
  // FIXME: We should be cleaning up these unused SGPR spill frame indices
  // somewhere.

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned ScratchRsrcReg
    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
  unsigned ScratchWaveOffsetReg
    = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);

  if (ScratchRsrcReg == AMDGPU::NoRegister) {
    assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
    return;
  }

  assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));

  // We need to do the replacement of the private segment buffer and wave
  // offset register even if there are no stack objects. There could be stores
  // to undef or to a constant address without an associated object.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prologue should
  // be emitted after frame indices are eliminated.

  if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
    emitFlatScratchInit(TII, TRI, MF, MBB);

  // We need to insert initialization of the scratch resource descriptor.
  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
  if (ST.isAmdCodeObjectV2()) {
    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
  }

  bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
  bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);

  // We added live-ins during argument lowering, but since they were not used
  // they were deleted. We're adding the uses now, so add them back.
  if (OffsetRegUsed) {
    assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
           "scratch wave offset input is required");
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
    assert(ST.isAmdCodeObjectV2());
    MRI.addLiveIn(PreloadedPrivateBufferReg);
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

  // Make the selected registers live throughout the function.
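  // The initializing copies below are emitted only in the entry block, but
  // scratch accesses anywhere in the function read these registers, so every
  // other block must list them as live-ins for the liveness to be valid.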
  for (MachineBasicBlock &OtherBB : MF) {
    if (&OtherBB == &MBB)
      continue;

    if (OffsetRegUsed)
      OtherBB.addLiveIn(ScratchWaveOffsetReg);

    if (ResourceRegUsed)
      OtherBB.addLiveIn(ScratchRsrcReg);
  }

  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // If we reserved the original input registers, we don't need to copy to the
  // reserved registers.

  bool CopyBuffer = ResourceRegUsed &&
    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
    ScratchRsrcReg != PreloadedPrivateBufferReg;

  // This needs to be careful of the copy ordering to avoid overwriting one of
  // the input registers before it has been copied to its final destination.
  // Usually the offset should be copied first.
  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
                                              ScratchWaveOffsetReg);
  if (CopyBuffer && CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (OffsetRegUsed &&
      PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
      .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
  }

  if (CopyBuffer && !CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
    assert(!ST.isAmdCodeObjectV2());
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and set up the other bits manually.
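    // SCRATCH_RSRC_DWORD0/1 are external symbols that are resolved to the
    // 64-bit scratch base address when the object is loaded. The remaining
    // two descriptor words are compile-time constants supplied by
    // getScratchRsrcWords23(); the low word goes into Rsrc2 and the high
    // word into Rsrc3 below.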
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();
    BuildMI(MBB, I, DL, SMovB32, Rsrc0)
      .addExternalSymbol("SCRATCH_RSRC_DWORD0")
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc1)
      .addExternalSymbol("SCRATCH_RSRC_DWORD1")
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  }
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {

}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (!MFI.hasStackObjects())
    return;

  // Any function with stack objects may need an emergency slot for the
  // register scavenger during frame index elimination.
  assert(RS && "RegScavenger required if spilling");

  int ScavengeFI = MFI.CreateStackObject(
    AMDGPU::SGPR_32RegClass.getSize(),
    AMDGPU::SGPR_32RegClass.getAlignment(), false);
  RS->addScavengingFrameIndex(ScavengeFI);
}

void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MachineBasicBlock::iterator I = MBB.begin();
  DebugLoc DL;

  // For each dimension:
  for (unsigned i = 0; i < 3; ++i) {
    // Get the work group ID SGPR, and make it live-in again.
    unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
    MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
    MBB.addLiveIn(WorkGroupIDSGPR);

    // Since SGPRs are spilled into VGPRs, copy the work group ID SGPR to a
    // VGPR in order to spill it to scratch.
    unsigned WorkGroupIDVGPR =
      MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
      .addReg(WorkGroupIDSGPR);

    // Spill the work group ID.
    int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
                             WorkGroupIDObjectIdx,
                             &AMDGPU::VGPR_32RegClass, TRI);

    // Get the work item ID VGPR, and make it live-in again.
    unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
    MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
    MBB.addLiveIn(WorkItemIDVGPR);

    // Spill the work item ID.
    int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
                             WorkItemIDObjectIdx,
                             &AMDGPU::VGPR_32RegClass, TRI);
  }
}