//===----------------------- SIFrameLowering.cpp --------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "AMDGPUSubtarget.h"

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;


static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
                              const MachineFrameInfo &MFI) {
  return FuncInfo->hasSpilledSGPRs() &&
    (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
}

static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
                                         const SIRegisterInfo *TRI) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      TRI->getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF,
                                       const SIRegisterInfo *TRI) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
                      TRI->getMaxNumSGPRs(MF));
}

void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
                                          const SIRegisterInfo* TRI,
                                          MachineFunction &MF,
                                          MachineBasicBlock &MBB) const {
  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  unsigned FlatScratchInitReg
    = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  // Copy the size in bytes.
  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
  const SISubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {

  // We need to insert initialization of the scratch resource descriptor.
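  // Note: the scratch resource descriptor is a 128-bit buffer descriptor held
  // in four consecutive SGPRs (an SGPR_128 tuple). The search below therefore
  // walks SGPR_128 tuples rather than individual SGPRs, and the preloaded
  // SGPR count is converted to tuple units by dividing by 4.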
  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  assert(ScratchRsrcReg != AMDGPU::NoRegister);

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. The only
  // ones we cannot eliminate are the resources required for scratch access.
  // For now we skip over user SGPRs and may leave unused holes.

  // We find the resource first because it has an alignment requirement.

  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
  // Skip the last 2 elements because the last one is reserved for VCC, and
  // this is the 2nd to last element already.
  for (MCPhysReg Reg : getAllSGPR128(MF, TRI).drop_back(2).slice(NumPreloaded)) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed.
    if (!MRI.isPhysRegUsed(Reg)) {
      assert(MRI.isAllocatable(Reg));
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
  const SISubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ST.hasSGPRInitBug() ||
      ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
    return ScratchWaveOffsetReg;

  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

  // We need to drop registers from the end of the list that we cannot use
  // for the scratch wave offset.
  // + 2 because s102 and s103 do not exist on VI.
  // + 2 for vcc
  // + 2 for xnack_mask
  // + 2 for flat_scratch
  // + 4 for registers reserved for the scratch resource register
  // + 1 for the register reserved for the scratch wave offset. (By excluding
  //     this register from the list to consider, it means that when this
  //     register is being used for the scratch wave offset and there
  //     are no other free SGPRs, then the value will stay in this register.)
  // ----
  //  13
  for (MCPhysReg Reg : getAllSGPRs(MF, TRI).drop_back(13).slice(NumPreloaded)) {
    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
    // scratch descriptor, since we haven't added its uses yet.
    if (!MRI.isPhysRegUsed(Reg)) {
      if (!MRI.isAllocatable(Reg) ||
          TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
        continue;

      MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
      MFI->setScratchWaveOffsetReg(Reg);
      return Reg;
    }
  }

  return ScratchWaveOffsetReg;
}

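// The prologue sets up the registers needed for scratch access: the
// flat_scratch pair when flat scratch accesses are used, the 128-bit scratch
// resource descriptor, and the per-wave scratch byte offset. Apart from the
// optional debugger prologue, nothing is emitted for functions without stack
// objects.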
void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
  // specified.
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  if (ST.debuggerEmitPrologue())
    emitDebuggerPrologue(MF, MBB);

  if (!MF.getFrameInfo().hasStackObjects())
    return;

  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If we only have SGPR spills, we won't actually be using scratch memory
  // since these spill to VGPRs.
  //
  // FIXME: We should be cleaning up these unused SGPR spill frame indices
  // somewhere.
  if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
    return;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  unsigned ScratchRsrcReg
    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
  unsigned ScratchWaveOffsetReg
    = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
  assert(ScratchRsrcReg != AMDGPU::NoRegister);
  assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
  assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));

  if (MFI->hasFlatScratchInit())
    emitFlatScratchInit(TII, TRI, MF, MBB);

  // We need to insert initialization of the scratch resource descriptor.
  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
  if (ST.isAmdCodeObjectV2()) {
    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
  }

  // If we reserved the original input registers, we don't need to copy to the
  // reserved registers.
  if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
    // We should always reserve these 5 registers at the same time.
    assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
           "scratch wave offset and private segment buffer inconsistent");
    return;
  }

  // We added live-ins during argument lowering, but since they were not used
  // they were deleted. We're adding the uses now, so add them back.
  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);

  if (ST.isAmdCodeObjectV2()) {
    MRI.addLiveIn(PreloadedPrivateBufferReg);
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

  // Make the selected registers live throughout the function.
  for (MachineBasicBlock &OtherBB : MF) {
    if (&OtherBB == &MBB)
      continue;

    OtherBB.addLiveIn(ScratchRsrcReg);
    OtherBB.addLiveIn(ScratchWaveOffsetReg);
  }

  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
    // Make sure we emit the copy for the offset first. We may have chosen to
    // copy the buffer resource into a register that aliases the input offset
    // register.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
      .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
  }

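  // Two ways of materializing the scratch resource descriptor: with the AMD
  // code object v2 ABI the full private segment buffer descriptor is
  // preloaded into SGPRs and can simply be copied; otherwise the descriptor
  // is assembled by hand, with the base address filled in through the
  // SCRATCH_RSRC_DWORD0/1 relocation symbols and the upper two words taken
  // from getScratchRsrcWords23().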
  if (ST.isAmdCodeObjectV2()) {
    // Insert copies from argument register.
    assert(
      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));

    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  } else {
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and set up the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();
    BuildMI(MBB, I, DL, SMovB32, Rsrc0)
      .addExternalSymbol("SCRATCH_RSRC_DWORD0")
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc1)
      .addExternalSymbol("SCRATCH_RSRC_DWORD1")
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  }
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {

}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (!MFI.hasStackObjects())
    return;

  bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();

  assert((RS || !MayNeedScavengingEmergencySlot) &&
         "RegScavenger required if spilling");

  if (MayNeedScavengingEmergencySlot) {
    int ScavengeFI = MFI.CreateStackObject(
      AMDGPU::SGPR_32RegClass.getSize(),
      AMDGPU::SGPR_32RegClass.getAlignment(), false);
    RS->addScavengingFrameIndex(ScavengeFI);
  }
}

void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MachineBasicBlock::iterator I = MBB.begin();
  DebugLoc DL;

  // For each dimension:
  for (unsigned i = 0; i < 3; ++i) {
    // Get work group ID SGPR, and make it live-in again.
    unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
    MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
    MBB.addLiveIn(WorkGroupIDSGPR);

    // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
    // order to spill it to scratch.
    unsigned WorkGroupIDVGPR =
      MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
      .addReg(WorkGroupIDSGPR);

    // Spill work group ID.
    int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
      WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);

    // Get work item ID VGPR, and make it live-in again.
    unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
    MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
    MBB.addLiveIn(WorkItemIDVGPR);

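    // Unlike the work group IDs above, the work item IDs already live in
    // VGPRs, so they can be written to the stack slot directly without an
    // intermediate V_MOV copy.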
    // Spill work item ID.
    int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
      WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
  }
}