1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI implementation of the TargetRegisterInfo class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIRegisterInfo.h" 15 #include "AMDGPURegisterBankInfo.h" 16 #include "AMDGPUSubtarget.h" 17 #include "SIInstrInfo.h" 18 #include "SIMachineFunctionInfo.h" 19 #include "MCTargetDesc/AMDGPUInstPrinter.h" 20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 21 #include "llvm/CodeGen/LiveIntervals.h" 22 #include "llvm/CodeGen/MachineDominators.h" 23 #include "llvm/CodeGen/MachineFrameInfo.h" 24 #include "llvm/CodeGen/MachineInstrBuilder.h" 25 #include "llvm/CodeGen/RegisterScavenging.h" 26 #include "llvm/CodeGen/SlotIndexes.h" 27 #include "llvm/IR/Function.h" 28 #include "llvm/IR/LLVMContext.h" 29 30 using namespace llvm; 31 32 #define GET_REGINFO_TARGET_DESC 33 #include "AMDGPUGenRegisterInfo.inc" 34 35 static cl::opt<bool> EnableSpillSGPRToVGPR( 36 "amdgpu-spill-sgpr-to-vgpr", 37 cl::desc("Enable spilling VGPRs to SGPRs"), 38 cl::ReallyHidden, 39 cl::init(true)); 40 41 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) 42 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST), 43 SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { 44 45 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && 46 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) && 47 (getSubRegIndexLaneMask(AMDGPU::lo16) | 48 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() == 49 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() && 50 "getNumCoveredRegs() will not work with generated subreg masks!"); 51 52 RegPressureIgnoredUnits.resize(getNumRegUnits()); 53 RegPressureIgnoredUnits.set(*MCRegUnitIterator(AMDGPU::M0, this)); 54 for (auto Reg : AMDGPU::VGPR_HI16RegClass) 55 RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this)); 56 } 57 58 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, 59 MCRegister Reg) const { 60 MCRegAliasIterator R(Reg, this, true); 61 62 for (; R.isValid(); ++R) 63 Reserved.set(*R); 64 } 65 66 // Forced to be here by one .inc 67 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( 68 const MachineFunction *MF) const { 69 CallingConv::ID CC = MF->getFunction().getCallingConv(); 70 switch (CC) { 71 case CallingConv::C: 72 case CallingConv::Fast: 73 case CallingConv::Cold: 74 return CSR_AMDGPU_HighRegs_SaveList; 75 default: { 76 // Dummy to not crash RegisterClassInfo. 77 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; 78 return &NoCalleeSavedReg; 79 } 80 } 81 } 82 83 const MCPhysReg * 84 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { 85 return nullptr; 86 } 87 88 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, 89 CallingConv::ID CC) const { 90 switch (CC) { 91 case CallingConv::C: 92 case CallingConv::Fast: 93 case CallingConv::Cold: 94 return CSR_AMDGPU_HighRegs_RegMask; 95 default: 96 return nullptr; 97 } 98 } 99 100 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { 101 const SIFrameLowering *TFI = 102 MF.getSubtarget<GCNSubtarget>().getFrameLowering(); 103 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 104 // During ISel lowering we always reserve the stack pointer in entry 105 // functions, but never actually want to reference it when accessing our own 106 // frame. If we need a frame pointer we use it, but otherwise we can just use 107 // an immediate "0" which we represent by returning NoRegister. 108 if (FuncInfo->isEntryFunction()) { 109 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); 110 } 111 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() 112 : FuncInfo->getStackPtrOffsetReg(); 113 } 114 115 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { 116 return CSR_AMDGPU_AllVGPRs_RegMask; 117 } 118 119 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { 120 return CSR_AMDGPU_AllAllocatableSRegs_RegMask; 121 } 122 123 // FIXME: TableGen should generate something to make this manageable for all 124 // register classes. At a minimum we could use the opposite of 125 // composeSubRegIndices and go up from the base 32-bit subreg. 126 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, 127 unsigned NumRegs) { 128 // Table of NumRegs sized pieces at every 32-bit offset. 129 static const uint16_t SubRegFromChannelTable[][32] = { 130 {AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 131 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 132 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 133 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 134 AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, 135 AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, 136 AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, 137 AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31}, 138 {AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3, 139 AMDGPU::sub3_sub4, AMDGPU::sub4_sub5, AMDGPU::sub5_sub6, 140 AMDGPU::sub6_sub7, AMDGPU::sub7_sub8, AMDGPU::sub8_sub9, 141 AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12, 142 AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15, 143 AMDGPU::sub15_sub16, AMDGPU::sub16_sub17, AMDGPU::sub17_sub18, 144 AMDGPU::sub18_sub19, AMDGPU::sub19_sub20, AMDGPU::sub20_sub21, 145 AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24, 146 AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27, 147 AMDGPU::sub27_sub28, AMDGPU::sub28_sub29, AMDGPU::sub29_sub30, 148 AMDGPU::sub30_sub31, AMDGPU::NoSubRegister}, 149 {AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3, 150 AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5, 151 AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7, 152 AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9, 153 AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11, 154 AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13, 155 AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15, 156 AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17, 157 AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19, 158 AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21, 159 AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23, 160 AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25, 161 AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27, 162 AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29, 163 AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31, 164 AMDGPU::NoSubRegister, AMDGPU::NoSubRegister}, 165 {AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4, 166 AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6, 167 AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8, 168 AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10, 169 AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12, 170 AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14, 171 AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16, 172 AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18, 173 AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20, 174 AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22, 175 AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24, 176 AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26, 177 AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28, 178 AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30, 179 AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister, 180 AMDGPU::NoSubRegister, AMDGPU::NoSubRegister}}; 181 182 const unsigned NumRegIndex = NumRegs - 1; 183 184 assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) && 185 "Not implemented"); 186 assert(Channel < array_lengthof(SubRegFromChannelTable[0])); 187 return SubRegFromChannelTable[NumRegIndex][Channel]; 188 } 189 190 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( 191 const MachineFunction &MF) const { 192 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; 193 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); 194 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); 195 } 196 197 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { 198 BitVector Reserved(getNumRegs()); 199 200 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but 201 // this seems likely to result in bugs, so I'm marking them as reserved. 202 reserveRegisterTuples(Reserved, AMDGPU::EXEC); 203 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); 204 205 // M0 has to be reserved so that llvm accepts it as a live-in into a block. 206 reserveRegisterTuples(Reserved, AMDGPU::M0); 207 208 // Reserve src_vccz, src_execz, src_scc. 209 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); 210 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); 211 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); 212 213 // Reserve the memory aperture registers. 214 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); 215 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); 216 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); 217 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); 218 219 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. 220 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); 221 222 // Reserve xnack_mask registers - support is not implemented in Codegen. 223 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); 224 225 // Reserve lds_direct register - support is not implemented in Codegen. 226 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); 227 228 // Reserve Trap Handler registers - support is not implemented in Codegen. 229 reserveRegisterTuples(Reserved, AMDGPU::TBA); 230 reserveRegisterTuples(Reserved, AMDGPU::TMA); 231 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); 232 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); 233 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); 234 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); 235 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); 236 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); 237 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); 238 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); 239 240 // Reserve null register - it shall never be allocated 241 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); 242 243 // Disallow vcc_hi allocation in wave32. It may be allocated but most likely 244 // will result in bugs. 245 if (isWave32) { 246 Reserved.set(AMDGPU::VCC); 247 Reserved.set(AMDGPU::VCC_HI); 248 } 249 250 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 251 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 252 for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { 253 unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); 254 reserveRegisterTuples(Reserved, Reg); 255 } 256 257 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); 258 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 259 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { 260 unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); 261 reserveRegisterTuples(Reserved, Reg); 262 Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 263 reserveRegisterTuples(Reserved, Reg); 264 } 265 266 // Reserve all the rest AGPRs if there are no instructions to use it. 267 if (!ST.hasMAIInsts()) { 268 for (unsigned i = 0; i < MaxNumVGPRs; ++i) { 269 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 270 reserveRegisterTuples(Reserved, Reg); 271 } 272 } 273 274 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 275 276 unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); 277 if (ScratchRSrcReg != AMDGPU::NoRegister) { 278 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need 279 // to spill. 280 // TODO: May need to reserve a VGPR if doing LDS spilling. 281 reserveRegisterTuples(Reserved, ScratchRSrcReg); 282 } 283 284 // We have to assume the SP is needed in case there are calls in the function, 285 // which is detected after the function is lowered. If we aren't really going 286 // to need SP, don't bother reserving it. 287 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); 288 289 if (StackPtrReg) { 290 reserveRegisterTuples(Reserved, StackPtrReg); 291 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); 292 } 293 294 MCRegister FrameReg = MFI->getFrameOffsetReg(); 295 if (FrameReg) { 296 reserveRegisterTuples(Reserved, FrameReg); 297 assert(!isSubRegister(ScratchRSrcReg, FrameReg)); 298 } 299 300 for (MCRegister Reg : MFI->WWMReservedRegs) { 301 reserveRegisterTuples(Reserved, Reg); 302 } 303 304 // FIXME: Stop using reserved registers for this. 305 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) 306 reserveRegisterTuples(Reserved, Reg); 307 308 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) 309 reserveRegisterTuples(Reserved, Reg); 310 311 return Reserved; 312 } 313 314 bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const { 315 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 316 // On entry, the base address is 0, so it can't possibly need any more 317 // alignment. 318 319 // FIXME: Should be able to specify the entry frame alignment per calling 320 // convention instead. 321 if (Info->isEntryFunction()) 322 return false; 323 324 return TargetRegisterInfo::canRealignStack(MF); 325 } 326 327 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { 328 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); 329 if (Info->isEntryFunction()) { 330 const MachineFrameInfo &MFI = Fn.getFrameInfo(); 331 return MFI.hasStackObjects() || MFI.hasCalls(); 332 } 333 334 // May need scavenger for dealing with callee saved registers. 335 return true; 336 } 337 338 bool SIRegisterInfo::requiresFrameIndexScavenging( 339 const MachineFunction &MF) const { 340 // Do not use frame virtual registers. They used to be used for SGPRs, but 341 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the 342 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a 343 // spill. 344 return false; 345 } 346 347 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( 348 const MachineFunction &MF) const { 349 const MachineFrameInfo &MFI = MF.getFrameInfo(); 350 return MFI.hasStackObjects(); 351 } 352 353 bool SIRegisterInfo::requiresVirtualBaseRegisters( 354 const MachineFunction &) const { 355 // There are no special dedicated stack or frame pointers. 356 return true; 357 } 358 359 int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const { 360 assert(SIInstrInfo::isMUBUF(*MI)); 361 362 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 363 AMDGPU::OpName::offset); 364 return MI->getOperand(OffIdx).getImm(); 365 } 366 367 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, 368 int Idx) const { 369 if (!SIInstrInfo::isMUBUF(*MI)) 370 return 0; 371 372 assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 373 AMDGPU::OpName::vaddr) && 374 "Should never see frame index on non-address operand"); 375 376 return getMUBUFInstrOffset(MI); 377 } 378 379 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { 380 if (!MI->mayLoadOrStore()) 381 return false; 382 383 int64_t FullOffset = Offset + getMUBUFInstrOffset(MI); 384 385 return !isUInt<12>(FullOffset); 386 } 387 388 void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, 389 unsigned BaseReg, 390 int FrameIdx, 391 int64_t Offset) const { 392 MachineBasicBlock::iterator Ins = MBB->begin(); 393 DebugLoc DL; // Defaults to "unknown" 394 395 if (Ins != MBB->end()) 396 DL = Ins->getDebugLoc(); 397 398 MachineFunction *MF = MBB->getParent(); 399 const SIInstrInfo *TII = ST.getInstrInfo(); 400 401 if (Offset == 0) { 402 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg) 403 .addFrameIndex(FrameIdx); 404 return; 405 } 406 407 MachineRegisterInfo &MRI = MF->getRegInfo(); 408 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 409 410 Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 411 412 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 413 .addImm(Offset); 414 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg) 415 .addFrameIndex(FrameIdx); 416 417 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) 418 .addReg(OffsetReg, RegState::Kill) 419 .addReg(FIReg) 420 .addImm(0); // clamp bit 421 } 422 423 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, 424 int64_t Offset) const { 425 const SIInstrInfo *TII = ST.getInstrInfo(); 426 427 #ifndef NDEBUG 428 // FIXME: Is it possible to be storing a frame index to itself? 429 bool SeenFI = false; 430 for (const MachineOperand &MO: MI.operands()) { 431 if (MO.isFI()) { 432 if (SeenFI) 433 llvm_unreachable("should not see multiple frame indices"); 434 435 SeenFI = true; 436 } 437 } 438 #endif 439 440 MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); 441 #ifndef NDEBUG 442 MachineBasicBlock *MBB = MI.getParent(); 443 MachineFunction *MF = MBB->getParent(); 444 #endif 445 assert(FIOp && FIOp->isFI() && "frame index must be address operand"); 446 assert(TII->isMUBUF(MI)); 447 assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() == 448 MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg() && 449 "should only be seeing stack pointer offset relative FrameIndex"); 450 451 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); 452 int64_t NewOffset = OffsetOp->getImm() + Offset; 453 assert(isUInt<12>(NewOffset) && "offset should be legal"); 454 455 FIOp->ChangeToRegister(BaseReg, false); 456 OffsetOp->setImm(NewOffset); 457 } 458 459 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, 460 unsigned BaseReg, 461 int64_t Offset) const { 462 if (!SIInstrInfo::isMUBUF(*MI)) 463 return false; 464 465 int64_t NewOffset = Offset + getMUBUFInstrOffset(MI); 466 467 return isUInt<12>(NewOffset); 468 } 469 470 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( 471 const MachineFunction &MF, unsigned Kind) const { 472 // This is inaccurate. It depends on the instruction and address space. The 473 // only place where we should hit this is for dealing with frame indexes / 474 // private accesses, so this is correct in that case. 475 return &AMDGPU::VGPR_32RegClass; 476 } 477 478 static unsigned getNumSubRegsForSpillOp(unsigned Op) { 479 480 switch (Op) { 481 case AMDGPU::SI_SPILL_S1024_SAVE: 482 case AMDGPU::SI_SPILL_S1024_RESTORE: 483 case AMDGPU::SI_SPILL_V1024_SAVE: 484 case AMDGPU::SI_SPILL_V1024_RESTORE: 485 case AMDGPU::SI_SPILL_A1024_SAVE: 486 case AMDGPU::SI_SPILL_A1024_RESTORE: 487 return 32; 488 case AMDGPU::SI_SPILL_S512_SAVE: 489 case AMDGPU::SI_SPILL_S512_RESTORE: 490 case AMDGPU::SI_SPILL_V512_SAVE: 491 case AMDGPU::SI_SPILL_V512_RESTORE: 492 case AMDGPU::SI_SPILL_A512_SAVE: 493 case AMDGPU::SI_SPILL_A512_RESTORE: 494 return 16; 495 case AMDGPU::SI_SPILL_S256_SAVE: 496 case AMDGPU::SI_SPILL_S256_RESTORE: 497 case AMDGPU::SI_SPILL_V256_SAVE: 498 case AMDGPU::SI_SPILL_V256_RESTORE: 499 return 8; 500 case AMDGPU::SI_SPILL_S160_SAVE: 501 case AMDGPU::SI_SPILL_S160_RESTORE: 502 case AMDGPU::SI_SPILL_V160_SAVE: 503 case AMDGPU::SI_SPILL_V160_RESTORE: 504 return 5; 505 case AMDGPU::SI_SPILL_S128_SAVE: 506 case AMDGPU::SI_SPILL_S128_RESTORE: 507 case AMDGPU::SI_SPILL_V128_SAVE: 508 case AMDGPU::SI_SPILL_V128_RESTORE: 509 case AMDGPU::SI_SPILL_A128_SAVE: 510 case AMDGPU::SI_SPILL_A128_RESTORE: 511 return 4; 512 case AMDGPU::SI_SPILL_S96_SAVE: 513 case AMDGPU::SI_SPILL_S96_RESTORE: 514 case AMDGPU::SI_SPILL_V96_SAVE: 515 case AMDGPU::SI_SPILL_V96_RESTORE: 516 return 3; 517 case AMDGPU::SI_SPILL_S64_SAVE: 518 case AMDGPU::SI_SPILL_S64_RESTORE: 519 case AMDGPU::SI_SPILL_V64_SAVE: 520 case AMDGPU::SI_SPILL_V64_RESTORE: 521 case AMDGPU::SI_SPILL_A64_SAVE: 522 case AMDGPU::SI_SPILL_A64_RESTORE: 523 return 2; 524 case AMDGPU::SI_SPILL_S32_SAVE: 525 case AMDGPU::SI_SPILL_S32_RESTORE: 526 case AMDGPU::SI_SPILL_V32_SAVE: 527 case AMDGPU::SI_SPILL_V32_RESTORE: 528 case AMDGPU::SI_SPILL_A32_SAVE: 529 case AMDGPU::SI_SPILL_A32_RESTORE: 530 return 1; 531 default: llvm_unreachable("Invalid spill opcode"); 532 } 533 } 534 535 static int getOffsetMUBUFStore(unsigned Opc) { 536 switch (Opc) { 537 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 538 return AMDGPU::BUFFER_STORE_DWORD_OFFSET; 539 case AMDGPU::BUFFER_STORE_BYTE_OFFEN: 540 return AMDGPU::BUFFER_STORE_BYTE_OFFSET; 541 case AMDGPU::BUFFER_STORE_SHORT_OFFEN: 542 return AMDGPU::BUFFER_STORE_SHORT_OFFSET; 543 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: 544 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; 545 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: 546 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; 547 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: 548 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; 549 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: 550 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; 551 default: 552 return -1; 553 } 554 } 555 556 static int getOffsetMUBUFLoad(unsigned Opc) { 557 switch (Opc) { 558 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 559 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 560 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: 561 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; 562 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: 563 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; 564 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: 565 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; 566 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: 567 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; 568 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: 569 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; 570 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: 571 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; 572 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: 573 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; 574 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: 575 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; 576 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: 577 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; 578 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: 579 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; 580 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: 581 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; 582 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: 583 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; 584 default: 585 return -1; 586 } 587 } 588 589 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, 590 MachineBasicBlock::iterator MI, 591 int Index, 592 unsigned Lane, 593 unsigned ValueReg, 594 bool IsKill) { 595 MachineBasicBlock *MBB = MI->getParent(); 596 MachineFunction *MF = MI->getParent()->getParent(); 597 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 598 const SIInstrInfo *TII = ST.getInstrInfo(); 599 600 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); 601 602 if (Reg == AMDGPU::NoRegister) 603 return MachineInstrBuilder(); 604 605 bool IsStore = MI->mayStore(); 606 MachineRegisterInfo &MRI = MF->getRegInfo(); 607 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 608 609 unsigned Dst = IsStore ? Reg : ValueReg; 610 unsigned Src = IsStore ? ValueReg : Reg; 611 unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32 612 : AMDGPU::V_ACCVGPR_READ_B32; 613 614 return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) 615 .addReg(Src, getKillRegState(IsKill)); 616 } 617 618 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not 619 // need to handle the case where an SGPR may need to be spilled while spilling. 620 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, 621 MachineFrameInfo &MFI, 622 MachineBasicBlock::iterator MI, 623 int Index, 624 int64_t Offset) { 625 const SIInstrInfo *TII = ST.getInstrInfo(); 626 MachineBasicBlock *MBB = MI->getParent(); 627 const DebugLoc &DL = MI->getDebugLoc(); 628 bool IsStore = MI->mayStore(); 629 630 unsigned Opc = MI->getOpcode(); 631 int LoadStoreOp = IsStore ? 632 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); 633 if (LoadStoreOp == -1) 634 return false; 635 636 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); 637 if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr()) 638 return true; 639 640 MachineInstrBuilder NewMI = 641 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) 642 .add(*Reg) 643 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) 644 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) 645 .addImm(Offset) 646 .addImm(0) // glc 647 .addImm(0) // slc 648 .addImm(0) // tfe 649 .addImm(0) // dlc 650 .addImm(0) // swz 651 .cloneMemRefs(*MI); 652 653 const MachineOperand *VDataIn = TII->getNamedOperand(*MI, 654 AMDGPU::OpName::vdata_in); 655 if (VDataIn) 656 NewMI.add(*VDataIn); 657 return true; 658 } 659 660 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, 661 unsigned LoadStoreOp, 662 int Index, 663 Register ValueReg, 664 bool IsKill, 665 MCRegister ScratchRsrcReg, 666 MCRegister ScratchOffsetReg, 667 int64_t InstOffset, 668 MachineMemOperand *MMO, 669 RegScavenger *RS) const { 670 MachineBasicBlock *MBB = MI->getParent(); 671 MachineFunction *MF = MI->getParent()->getParent(); 672 const SIInstrInfo *TII = ST.getInstrInfo(); 673 const MachineFrameInfo &MFI = MF->getFrameInfo(); 674 675 const MCInstrDesc &Desc = TII->get(LoadStoreOp); 676 const DebugLoc &DL = MI->getDebugLoc(); 677 bool IsStore = Desc.mayStore(); 678 679 bool Scavenged = false; 680 MCRegister SOffset = ScratchOffsetReg; 681 682 const unsigned EltSize = 4; 683 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); 684 unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT); 685 unsigned Size = NumSubRegs * EltSize; 686 int64_t Offset = InstOffset + MFI.getObjectOffset(Index); 687 int64_t ScratchOffsetRegDelta = 0; 688 689 Align Alignment = MFI.getObjectAlign(Index); 690 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); 691 692 Register TmpReg = 693 hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg() 694 : Register(); 695 696 assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset"); 697 698 if (!isUInt<12>(Offset + Size - EltSize)) { 699 SOffset = MCRegister(); 700 701 // We currently only support spilling VGPRs to EltSize boundaries, meaning 702 // we can simplify the adjustment of Offset here to just scale with 703 // WavefrontSize. 704 Offset *= ST.getWavefrontSize(); 705 706 // We don't have access to the register scavenger if this function is called 707 // during PEI::scavengeFrameVirtualRegs(). 708 if (RS) 709 SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); 710 711 if (!SOffset) { 712 if (!ScratchOffsetReg) { 713 report_fatal_error("could not scavenge SGPR to spill in entry function"); 714 } 715 // There are no free SGPRs, and since we are in the process of spilling 716 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true 717 // on SI/CI and on VI it is true until we implement spilling using scalar 718 // stores), we have no way to free up an SGPR. Our solution here is to 719 // add the offset directly to the ScratchOffset register, and then 720 // subtract the offset after the spill to return ScratchOffset to it's 721 // original value. 722 SOffset = ScratchOffsetReg; 723 ScratchOffsetRegDelta = Offset; 724 } else { 725 Scavenged = true; 726 } 727 728 if (ScratchOffsetReg == AMDGPU::NoRegister) { 729 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset) 730 .addImm(Offset); 731 } else { 732 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) 733 .addReg(ScratchOffsetReg) 734 .addImm(Offset); 735 } 736 737 Offset = 0; 738 } 739 740 for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { 741 Register SubReg = NumSubRegs == 1 742 ? Register(ValueReg) 743 : getSubReg(ValueReg, getSubRegFromChannel(i)); 744 745 unsigned SOffsetRegState = 0; 746 unsigned SrcDstRegState = getDefRegState(!IsStore); 747 if (i + 1 == e) { 748 SOffsetRegState |= getKillRegState(Scavenged); 749 // The last implicit use carries the "Kill" flag. 750 SrcDstRegState |= getKillRegState(IsKill); 751 } 752 753 auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill); 754 755 if (!MIB.getInstr()) { 756 unsigned FinalReg = SubReg; 757 if (TmpReg != AMDGPU::NoRegister) { 758 if (IsStore) 759 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg) 760 .addReg(SubReg, getKillRegState(IsKill)); 761 SubReg = TmpReg; 762 } 763 764 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); 765 MachineMemOperand *NewMMO = 766 MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize, 767 commonAlignment(Alignment, EltSize * i)); 768 769 MIB = BuildMI(*MBB, MI, DL, Desc) 770 .addReg(SubReg, 771 getDefRegState(!IsStore) | getKillRegState(IsKill)) 772 .addReg(ScratchRsrcReg); 773 if (SOffset == AMDGPU::NoRegister) { 774 MIB.addImm(0); 775 } else { 776 MIB.addReg(SOffset, SOffsetRegState); 777 } 778 MIB.addImm(Offset) 779 .addImm(0) // glc 780 .addImm(0) // slc 781 .addImm(0) // tfe 782 .addImm(0) // dlc 783 .addImm(0) // swz 784 .addMemOperand(NewMMO); 785 786 if (!IsStore && TmpReg != AMDGPU::NoRegister) 787 MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), 788 FinalReg) 789 .addReg(TmpReg, RegState::Kill); 790 } 791 792 if (NumSubRegs > 1) 793 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 794 } 795 796 if (ScratchOffsetRegDelta != 0) { 797 // Subtract the offset we added to the ScratchOffset register. 798 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg) 799 .addReg(ScratchOffsetReg) 800 .addImm(ScratchOffsetRegDelta); 801 } 802 } 803 804 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, 805 int Index, 806 RegScavenger *RS, 807 bool OnlyToVGPR) const { 808 MachineBasicBlock *MBB = MI->getParent(); 809 MachineFunction *MF = MBB->getParent(); 810 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 811 DenseSet<unsigned> SGPRSpillVGPRDefinedSet; 812 813 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills 814 = MFI->getSGPRToVGPRSpills(Index); 815 bool SpillToVGPR = !VGPRSpills.empty(); 816 if (OnlyToVGPR && !SpillToVGPR) 817 return false; 818 819 const SIInstrInfo *TII = ST.getInstrInfo(); 820 821 Register SuperReg = MI->getOperand(0).getReg(); 822 bool IsKill = MI->getOperand(0).isKill(); 823 const DebugLoc &DL = MI->getDebugLoc(); 824 825 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 826 827 assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && 828 SuperReg != MFI->getFrameOffsetReg())); 829 830 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 831 832 unsigned EltSize = 4; 833 const TargetRegisterClass *RC = getPhysRegClass(SuperReg); 834 835 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); 836 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 837 838 // Scavenged temporary VGPR to use. It must be scavenged once for any number 839 // of spilled subregs. 840 Register TmpVGPR; 841 842 // SubReg carries the "Kill" flag when SubReg == SuperReg. 843 unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); 844 for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { 845 Register SubReg = 846 NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); 847 848 if (SpillToVGPR) { 849 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 850 851 // During SGPR spilling to VGPR, determine if the VGPR is defined. The 852 // only circumstance in which we say it is undefined is when it is the 853 // first spill to this VGPR in the first basic block. 854 bool VGPRDefined = true; 855 if (MBB == &MF->front()) 856 VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second; 857 858 // Mark the "old value of vgpr" input undef only if this is the first sgpr 859 // spill to this specific vgpr in the first basic block. 860 BuildMI(*MBB, MI, DL, 861 TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), 862 Spill.VGPR) 863 .addReg(SubReg, getKillRegState(IsKill)) 864 .addImm(Spill.Lane) 865 .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef); 866 867 // FIXME: Since this spills to another register instead of an actual 868 // frame index, we should delete the frame index when all references to 869 // it are fixed. 870 } else { 871 // XXX - Can to VGPR spill fail for some subregisters but not others? 872 if (OnlyToVGPR) 873 return false; 874 875 // Spill SGPR to a frame index. 876 if (!TmpVGPR.isValid()) 877 TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 878 879 MachineInstrBuilder Mov 880 = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 881 .addReg(SubReg, SubKillState); 882 883 // There could be undef components of a spilled super register. 884 // TODO: Can we detect this and skip the spill? 885 if (NumSubRegs > 1) { 886 // The last implicit use of the SuperReg carries the "Kill" flag. 887 unsigned SuperKillState = 0; 888 if (i + 1 == e) 889 SuperKillState |= getKillRegState(IsKill); 890 Mov.addReg(SuperReg, RegState::Implicit | SuperKillState); 891 } 892 893 Align Alignment = FrameInfo.getObjectAlign(Index); 894 MachinePointerInfo PtrInfo 895 = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); 896 MachineMemOperand *MMO = 897 MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize, 898 commonAlignment(Alignment, EltSize * i)); 899 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) 900 .addReg(TmpVGPR, RegState::Kill) // src 901 .addFrameIndex(Index) // vaddr 902 .addReg(MFI->getScratchRSrcReg()) // srrsrc 903 .addReg(MFI->getStackPtrOffsetReg()) // soffset 904 .addImm(i * 4) // offset 905 .addMemOperand(MMO); 906 } 907 } 908 909 MI->eraseFromParent(); 910 MFI->addToSpilledSGPRs(NumSubRegs); 911 return true; 912 } 913 914 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, 915 int Index, 916 RegScavenger *RS, 917 bool OnlyToVGPR) const { 918 MachineFunction *MF = MI->getParent()->getParent(); 919 MachineBasicBlock *MBB = MI->getParent(); 920 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 921 922 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills 923 = MFI->getSGPRToVGPRSpills(Index); 924 bool SpillToVGPR = !VGPRSpills.empty(); 925 if (OnlyToVGPR && !SpillToVGPR) 926 return false; 927 928 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 929 const SIInstrInfo *TII = ST.getInstrInfo(); 930 const DebugLoc &DL = MI->getDebugLoc(); 931 932 Register SuperReg = MI->getOperand(0).getReg(); 933 934 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 935 936 unsigned EltSize = 4; 937 938 const TargetRegisterClass *RC = getPhysRegClass(SuperReg); 939 940 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); 941 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 942 943 Register TmpVGPR; 944 945 for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { 946 Register SubReg = 947 NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); 948 949 if (SpillToVGPR) { 950 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 951 auto MIB = 952 BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), 953 SubReg) 954 .addReg(Spill.VGPR) 955 .addImm(Spill.Lane); 956 957 if (NumSubRegs > 1 && i == 0) 958 MIB.addReg(SuperReg, RegState::ImplicitDefine); 959 } else { 960 if (OnlyToVGPR) 961 return false; 962 963 // Restore SGPR from a stack slot. 964 // FIXME: We should use S_LOAD_DWORD here for VI. 965 if (!TmpVGPR.isValid()) 966 TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 967 Align Alignment = FrameInfo.getObjectAlign(Index); 968 969 MachinePointerInfo PtrInfo 970 = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); 971 972 MachineMemOperand *MMO = 973 MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, EltSize, 974 commonAlignment(Alignment, EltSize * i)); 975 976 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR) 977 .addFrameIndex(Index) // vaddr 978 .addReg(MFI->getScratchRSrcReg()) // srsrc 979 .addReg(MFI->getStackPtrOffsetReg()) // soffset 980 .addImm(i * 4) // offset 981 .addMemOperand(MMO); 982 983 auto MIB = 984 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) 985 .addReg(TmpVGPR, RegState::Kill); 986 987 if (NumSubRegs > 1) 988 MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); 989 } 990 } 991 992 MI->eraseFromParent(); 993 return true; 994 } 995 996 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 997 /// a VGPR and the stack slot can be safely eliminated when all other users are 998 /// handled. 999 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 1000 MachineBasicBlock::iterator MI, 1001 int FI, 1002 RegScavenger *RS) const { 1003 switch (MI->getOpcode()) { 1004 case AMDGPU::SI_SPILL_S1024_SAVE: 1005 case AMDGPU::SI_SPILL_S512_SAVE: 1006 case AMDGPU::SI_SPILL_S256_SAVE: 1007 case AMDGPU::SI_SPILL_S160_SAVE: 1008 case AMDGPU::SI_SPILL_S128_SAVE: 1009 case AMDGPU::SI_SPILL_S96_SAVE: 1010 case AMDGPU::SI_SPILL_S64_SAVE: 1011 case AMDGPU::SI_SPILL_S32_SAVE: 1012 return spillSGPR(MI, FI, RS, true); 1013 case AMDGPU::SI_SPILL_S1024_RESTORE: 1014 case AMDGPU::SI_SPILL_S512_RESTORE: 1015 case AMDGPU::SI_SPILL_S256_RESTORE: 1016 case AMDGPU::SI_SPILL_S160_RESTORE: 1017 case AMDGPU::SI_SPILL_S128_RESTORE: 1018 case AMDGPU::SI_SPILL_S96_RESTORE: 1019 case AMDGPU::SI_SPILL_S64_RESTORE: 1020 case AMDGPU::SI_SPILL_S32_RESTORE: 1021 return restoreSGPR(MI, FI, RS, true); 1022 default: 1023 llvm_unreachable("not an SGPR spill instruction"); 1024 } 1025 } 1026 1027 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 1028 int SPAdj, unsigned FIOperandNum, 1029 RegScavenger *RS) const { 1030 MachineFunction *MF = MI->getParent()->getParent(); 1031 MachineBasicBlock *MBB = MI->getParent(); 1032 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1033 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 1034 const SIInstrInfo *TII = ST.getInstrInfo(); 1035 DebugLoc DL = MI->getDebugLoc(); 1036 1037 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 1038 1039 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 1040 int Index = MI->getOperand(FIOperandNum).getIndex(); 1041 1042 Register FrameReg = getFrameRegister(*MF); 1043 1044 switch (MI->getOpcode()) { 1045 // SGPR register spill 1046 case AMDGPU::SI_SPILL_S1024_SAVE: 1047 case AMDGPU::SI_SPILL_S512_SAVE: 1048 case AMDGPU::SI_SPILL_S256_SAVE: 1049 case AMDGPU::SI_SPILL_S160_SAVE: 1050 case AMDGPU::SI_SPILL_S128_SAVE: 1051 case AMDGPU::SI_SPILL_S96_SAVE: 1052 case AMDGPU::SI_SPILL_S64_SAVE: 1053 case AMDGPU::SI_SPILL_S32_SAVE: { 1054 spillSGPR(MI, Index, RS); 1055 break; 1056 } 1057 1058 // SGPR register restore 1059 case AMDGPU::SI_SPILL_S1024_RESTORE: 1060 case AMDGPU::SI_SPILL_S512_RESTORE: 1061 case AMDGPU::SI_SPILL_S256_RESTORE: 1062 case AMDGPU::SI_SPILL_S160_RESTORE: 1063 case AMDGPU::SI_SPILL_S128_RESTORE: 1064 case AMDGPU::SI_SPILL_S96_RESTORE: 1065 case AMDGPU::SI_SPILL_S64_RESTORE: 1066 case AMDGPU::SI_SPILL_S32_RESTORE: { 1067 restoreSGPR(MI, Index, RS); 1068 break; 1069 } 1070 1071 // VGPR register spill 1072 case AMDGPU::SI_SPILL_V1024_SAVE: 1073 case AMDGPU::SI_SPILL_V512_SAVE: 1074 case AMDGPU::SI_SPILL_V256_SAVE: 1075 case AMDGPU::SI_SPILL_V160_SAVE: 1076 case AMDGPU::SI_SPILL_V128_SAVE: 1077 case AMDGPU::SI_SPILL_V96_SAVE: 1078 case AMDGPU::SI_SPILL_V64_SAVE: 1079 case AMDGPU::SI_SPILL_V32_SAVE: 1080 case AMDGPU::SI_SPILL_A1024_SAVE: 1081 case AMDGPU::SI_SPILL_A512_SAVE: 1082 case AMDGPU::SI_SPILL_A128_SAVE: 1083 case AMDGPU::SI_SPILL_A64_SAVE: 1084 case AMDGPU::SI_SPILL_A32_SAVE: { 1085 const MachineOperand *VData = TII->getNamedOperand(*MI, 1086 AMDGPU::OpName::vdata); 1087 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1088 MFI->getStackPtrOffsetReg()); 1089 1090 buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, 1091 Index, 1092 VData->getReg(), VData->isKill(), 1093 TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), 1094 FrameReg, 1095 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1096 *MI->memoperands_begin(), 1097 RS); 1098 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 1099 MI->eraseFromParent(); 1100 break; 1101 } 1102 case AMDGPU::SI_SPILL_V32_RESTORE: 1103 case AMDGPU::SI_SPILL_V64_RESTORE: 1104 case AMDGPU::SI_SPILL_V96_RESTORE: 1105 case AMDGPU::SI_SPILL_V128_RESTORE: 1106 case AMDGPU::SI_SPILL_V160_RESTORE: 1107 case AMDGPU::SI_SPILL_V256_RESTORE: 1108 case AMDGPU::SI_SPILL_V512_RESTORE: 1109 case AMDGPU::SI_SPILL_V1024_RESTORE: 1110 case AMDGPU::SI_SPILL_A32_RESTORE: 1111 case AMDGPU::SI_SPILL_A64_RESTORE: 1112 case AMDGPU::SI_SPILL_A128_RESTORE: 1113 case AMDGPU::SI_SPILL_A512_RESTORE: 1114 case AMDGPU::SI_SPILL_A1024_RESTORE: { 1115 const MachineOperand *VData = TII->getNamedOperand(*MI, 1116 AMDGPU::OpName::vdata); 1117 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1118 MFI->getStackPtrOffsetReg()); 1119 1120 buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, 1121 Index, 1122 VData->getReg(), VData->isKill(), 1123 TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), 1124 FrameReg, 1125 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1126 *MI->memoperands_begin(), 1127 RS); 1128 MI->eraseFromParent(); 1129 break; 1130 } 1131 1132 default: { 1133 const DebugLoc &DL = MI->getDebugLoc(); 1134 bool IsMUBUF = TII->isMUBUF(*MI); 1135 1136 if (!IsMUBUF && !MFI->isEntryFunction()) { 1137 // Convert to a swizzled stack address by scaling by the wave size. 1138 // 1139 // In an entry function/kernel the offset is already swizzled. 1140 1141 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; 1142 Register ResultReg = 1143 IsCopy ? MI->getOperand(0).getReg() 1144 : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1145 1146 int64_t Offset = FrameInfo.getObjectOffset(Index); 1147 if (Offset == 0) { 1148 // XXX - This never happens because of emergency scavenging slot at 0? 1149 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) 1150 .addImm(ST.getWavefrontSizeLog2()) 1151 .addReg(FrameReg); 1152 } else { 1153 if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { 1154 // Reuse ResultReg in intermediate step. 1155 Register ScaledReg = ResultReg; 1156 1157 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 1158 ScaledReg) 1159 .addImm(ST.getWavefrontSizeLog2()) 1160 .addReg(FrameReg); 1161 1162 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 1163 1164 // TODO: Fold if use instruction is another add of a constant. 1165 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 1166 // FIXME: This can fail 1167 MIB.addImm(Offset); 1168 MIB.addReg(ScaledReg, RegState::Kill); 1169 if (!IsVOP2) 1170 MIB.addImm(0); // clamp bit 1171 } else { 1172 assert(MIB->getOpcode() == AMDGPU::V_ADD_I32_e64 && 1173 "Need to reuse carry out register"); 1174 1175 // Use scavenged unused carry out as offset register. 1176 Register ConstOffsetReg; 1177 if (!isWave32) 1178 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 1179 else 1180 ConstOffsetReg = MIB.getReg(1); 1181 1182 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 1183 .addImm(Offset); 1184 MIB.addReg(ConstOffsetReg, RegState::Kill); 1185 MIB.addReg(ScaledReg, RegState::Kill); 1186 MIB.addImm(0); // clamp bit 1187 } 1188 } else { 1189 // We have to produce a carry out, and there isn't a free SGPR pair 1190 // for it. We can keep the whole computation on the SALU to avoid 1191 // clobbering an additional register at the cost of an extra mov. 1192 1193 // We may have 1 free scratch SGPR even though a carry out is 1194 // unavailable. Only one additional mov is needed. 1195 Register TmpScaledReg = 1196 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); 1197 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; 1198 1199 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) 1200 .addReg(FrameReg) 1201 .addImm(ST.getWavefrontSizeLog2()); 1202 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg) 1203 .addReg(ScaledReg, RegState::Kill) 1204 .addImm(Offset); 1205 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) 1206 .addReg(ScaledReg, RegState::Kill); 1207 1208 // If there were truly no free SGPRs, we need to undo everything. 1209 if (!TmpScaledReg.isValid()) { 1210 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg) 1211 .addReg(ScaledReg, RegState::Kill) 1212 .addImm(Offset); 1213 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) 1214 .addReg(FrameReg) 1215 .addImm(ST.getWavefrontSizeLog2()); 1216 } 1217 } 1218 } 1219 1220 // Don't introduce an extra copy if we're just materializing in a mov. 1221 if (IsCopy) 1222 MI->eraseFromParent(); 1223 else 1224 FIOp.ChangeToRegister(ResultReg, false, false, true); 1225 return; 1226 } 1227 1228 if (IsMUBUF) { 1229 // Disable offen so we don't need a 0 vgpr base. 1230 assert(static_cast<int>(FIOperandNum) == 1231 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 1232 AMDGPU::OpName::vaddr)); 1233 1234 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); 1235 assert((SOffset.isReg() && 1236 SOffset.getReg() == MFI->getStackPtrOffsetReg()) || 1237 (SOffset.isImm() && SOffset.getImm() == 0)); 1238 if (SOffset.isReg()) { 1239 if (FrameReg == AMDGPU::NoRegister) { 1240 SOffset.ChangeToImmediate(0); 1241 } else { 1242 SOffset.setReg(FrameReg); 1243 } 1244 } 1245 1246 int64_t Offset = FrameInfo.getObjectOffset(Index); 1247 int64_t OldImm 1248 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 1249 int64_t NewOffset = OldImm + Offset; 1250 1251 if (isUInt<12>(NewOffset) && 1252 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { 1253 MI->eraseFromParent(); 1254 return; 1255 } 1256 } 1257 1258 // If the offset is simply too big, don't convert to a scratch wave offset 1259 // relative index. 1260 1261 int64_t Offset = FrameInfo.getObjectOffset(Index); 1262 FIOp.ChangeToImmediate(Offset); 1263 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 1264 Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1265 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 1266 .addImm(Offset); 1267 FIOp.ChangeToRegister(TmpReg, false, false, true); 1268 } 1269 } 1270 } 1271 } 1272 1273 StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { 1274 return AMDGPUInstPrinter::getRegisterName(Reg); 1275 } 1276 1277 // FIXME: This is very slow. It might be worth creating a map from physreg to 1278 // register class. 1279 const TargetRegisterClass * 1280 SIRegisterInfo::getPhysRegClass(MCRegister Reg) const { 1281 static const TargetRegisterClass *const BaseClasses[] = { 1282 &AMDGPU::VGPR_32RegClass, 1283 &AMDGPU::SReg_32RegClass, 1284 &AMDGPU::AGPR_32RegClass, 1285 &AMDGPU::VReg_64RegClass, 1286 &AMDGPU::SReg_64RegClass, 1287 &AMDGPU::AReg_64RegClass, 1288 &AMDGPU::VReg_96RegClass, 1289 &AMDGPU::SReg_96RegClass, 1290 &AMDGPU::VReg_128RegClass, 1291 &AMDGPU::SReg_128RegClass, 1292 &AMDGPU::AReg_128RegClass, 1293 &AMDGPU::VReg_160RegClass, 1294 &AMDGPU::SReg_160RegClass, 1295 &AMDGPU::VReg_256RegClass, 1296 &AMDGPU::SReg_256RegClass, 1297 &AMDGPU::VReg_512RegClass, 1298 &AMDGPU::SReg_512RegClass, 1299 &AMDGPU::AReg_512RegClass, 1300 &AMDGPU::SReg_1024RegClass, 1301 &AMDGPU::VReg_1024RegClass, 1302 &AMDGPU::AReg_1024RegClass, 1303 &AMDGPU::SCC_CLASSRegClass, 1304 &AMDGPU::Pseudo_SReg_32RegClass, 1305 &AMDGPU::Pseudo_SReg_128RegClass, 1306 }; 1307 1308 for (const TargetRegisterClass *BaseClass : BaseClasses) { 1309 if (BaseClass->contains(Reg)) { 1310 return BaseClass; 1311 } 1312 } 1313 return nullptr; 1314 } 1315 1316 // TODO: It might be helpful to have some target specific flags in 1317 // TargetRegisterClass to mark which classes are VGPRs to make this trivial. 1318 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { 1319 unsigned Size = getRegSizeInBits(*RC); 1320 switch (Size) { 1321 case 32: 1322 return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; 1323 case 64: 1324 return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; 1325 case 96: 1326 return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; 1327 case 128: 1328 return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; 1329 case 160: 1330 return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr; 1331 case 256: 1332 return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; 1333 case 512: 1334 return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; 1335 case 1024: 1336 return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; 1337 case 1: 1338 return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr; 1339 default: 1340 assert(Size < 32 && "Invalid register class size"); 1341 return false; 1342 } 1343 } 1344 1345 bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const { 1346 unsigned Size = getRegSizeInBits(*RC); 1347 if (Size < 32) 1348 return false; 1349 switch (Size) { 1350 case 32: 1351 return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr; 1352 case 64: 1353 return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr; 1354 case 96: 1355 return false; 1356 case 128: 1357 return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr; 1358 case 160: 1359 case 256: 1360 return false; 1361 case 512: 1362 return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr; 1363 case 1024: 1364 return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr; 1365 default: 1366 llvm_unreachable("Invalid register class size"); 1367 } 1368 } 1369 1370 const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( 1371 const TargetRegisterClass *SRC) const { 1372 switch (getRegSizeInBits(*SRC)) { 1373 case 32: 1374 return &AMDGPU::VGPR_32RegClass; 1375 case 64: 1376 return &AMDGPU::VReg_64RegClass; 1377 case 96: 1378 return &AMDGPU::VReg_96RegClass; 1379 case 128: 1380 return &AMDGPU::VReg_128RegClass; 1381 case 160: 1382 return &AMDGPU::VReg_160RegClass; 1383 case 256: 1384 return &AMDGPU::VReg_256RegClass; 1385 case 512: 1386 return &AMDGPU::VReg_512RegClass; 1387 case 1024: 1388 return &AMDGPU::VReg_1024RegClass; 1389 case 1: 1390 return &AMDGPU::VReg_1RegClass; 1391 default: 1392 llvm_unreachable("Invalid register class size"); 1393 } 1394 } 1395 1396 const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass( 1397 const TargetRegisterClass *SRC) const { 1398 switch (getRegSizeInBits(*SRC)) { 1399 case 32: 1400 return &AMDGPU::AGPR_32RegClass; 1401 case 64: 1402 return &AMDGPU::AReg_64RegClass; 1403 case 128: 1404 return &AMDGPU::AReg_128RegClass; 1405 case 512: 1406 return &AMDGPU::AReg_512RegClass; 1407 case 1024: 1408 return &AMDGPU::AReg_1024RegClass; 1409 default: 1410 llvm_unreachable("Invalid register class size"); 1411 } 1412 } 1413 1414 const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( 1415 const TargetRegisterClass *VRC) const { 1416 switch (getRegSizeInBits(*VRC)) { 1417 case 32: 1418 return &AMDGPU::SGPR_32RegClass; 1419 case 64: 1420 return &AMDGPU::SReg_64RegClass; 1421 case 96: 1422 return &AMDGPU::SReg_96RegClass; 1423 case 128: 1424 return &AMDGPU::SGPR_128RegClass; 1425 case 160: 1426 return &AMDGPU::SReg_160RegClass; 1427 case 256: 1428 return &AMDGPU::SReg_256RegClass; 1429 case 512: 1430 return &AMDGPU::SReg_512RegClass; 1431 case 1024: 1432 return &AMDGPU::SReg_1024RegClass; 1433 default: 1434 llvm_unreachable("Invalid register class size"); 1435 } 1436 } 1437 1438 const TargetRegisterClass *SIRegisterInfo::getSubRegClass( 1439 const TargetRegisterClass *RC, unsigned SubIdx) const { 1440 if (SubIdx == AMDGPU::NoSubRegister) 1441 return RC; 1442 1443 // We can assume that each lane corresponds to one 32-bit register. 1444 unsigned Count = getNumChannelsFromSubReg(SubIdx); 1445 if (isSGPRClass(RC)) { 1446 switch (Count) { 1447 case 1: 1448 return &AMDGPU::SGPR_32RegClass; 1449 case 2: 1450 return &AMDGPU::SReg_64RegClass; 1451 case 3: 1452 return &AMDGPU::SReg_96RegClass; 1453 case 4: 1454 return &AMDGPU::SGPR_128RegClass; 1455 case 5: 1456 return &AMDGPU::SReg_160RegClass; 1457 case 8: 1458 return &AMDGPU::SReg_256RegClass; 1459 case 16: 1460 return &AMDGPU::SReg_512RegClass; 1461 case 32: /* fall-through */ 1462 default: 1463 llvm_unreachable("Invalid sub-register class size"); 1464 } 1465 } else if (hasAGPRs(RC)) { 1466 switch (Count) { 1467 case 1: 1468 return &AMDGPU::AGPR_32RegClass; 1469 case 2: 1470 return &AMDGPU::AReg_64RegClass; 1471 case 4: 1472 return &AMDGPU::AReg_128RegClass; 1473 case 16: 1474 return &AMDGPU::AReg_512RegClass; 1475 case 32: /* fall-through */ 1476 default: 1477 llvm_unreachable("Invalid sub-register class size"); 1478 } 1479 } else { 1480 switch (Count) { 1481 case 1: 1482 return &AMDGPU::VGPR_32RegClass; 1483 case 2: 1484 return &AMDGPU::VReg_64RegClass; 1485 case 3: 1486 return &AMDGPU::VReg_96RegClass; 1487 case 4: 1488 return &AMDGPU::VReg_128RegClass; 1489 case 5: 1490 return &AMDGPU::VReg_160RegClass; 1491 case 8: 1492 return &AMDGPU::VReg_256RegClass; 1493 case 16: 1494 return &AMDGPU::VReg_512RegClass; 1495 case 32: /* fall-through */ 1496 default: 1497 llvm_unreachable("Invalid sub-register class size"); 1498 } 1499 } 1500 } 1501 1502 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { 1503 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && 1504 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) 1505 return !ST.hasMFMAInlineLiteralBug(); 1506 1507 return OpType >= AMDGPU::OPERAND_SRC_FIRST && 1508 OpType <= AMDGPU::OPERAND_SRC_LAST; 1509 } 1510 1511 bool SIRegisterInfo::shouldRewriteCopySrc( 1512 const TargetRegisterClass *DefRC, 1513 unsigned DefSubReg, 1514 const TargetRegisterClass *SrcRC, 1515 unsigned SrcSubReg) const { 1516 // We want to prefer the smallest register class possible, so we don't want to 1517 // stop and rewrite on anything that looks like a subregister 1518 // extract. Operations mostly don't care about the super register class, so we 1519 // only want to stop on the most basic of copies between the same register 1520 // class. 1521 // 1522 // e.g. if we have something like 1523 // %0 = ... 1524 // %1 = ... 1525 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 1526 // %3 = COPY %2, sub0 1527 // 1528 // We want to look through the COPY to find: 1529 // => %3 = COPY %0 1530 1531 // Plain copy. 1532 return getCommonSubClass(DefRC, SrcRC) != nullptr; 1533 } 1534 1535 /// Returns a register that is not used at any point in the function. 1536 /// If all registers are used, then this function will return 1537 // AMDGPU::NoRegister. 1538 MCRegister 1539 SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, 1540 const TargetRegisterClass *RC, 1541 const MachineFunction &MF) const { 1542 1543 for (MCRegister Reg : *RC) 1544 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 1545 return Reg; 1546 return MCRegister(); 1547 } 1548 1549 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, 1550 unsigned EltSize) const { 1551 if (EltSize == 4) { 1552 static const int16_t Sub0_31[] = { 1553 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1554 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1555 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 1556 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 1557 AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, 1558 AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, 1559 AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, 1560 AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31, 1561 }; 1562 1563 static const int16_t Sub0_15[] = { 1564 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1565 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1566 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 1567 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 1568 }; 1569 1570 static const int16_t Sub0_7[] = { 1571 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1572 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1573 }; 1574 1575 static const int16_t Sub0_4[] = { 1576 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, 1577 }; 1578 1579 static const int16_t Sub0_3[] = { 1580 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1581 }; 1582 1583 static const int16_t Sub0_2[] = { 1584 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 1585 }; 1586 1587 static const int16_t Sub0_1[] = { 1588 AMDGPU::sub0, AMDGPU::sub1, 1589 }; 1590 1591 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1592 case 32: 1593 return {}; 1594 case 64: 1595 return makeArrayRef(Sub0_1); 1596 case 96: 1597 return makeArrayRef(Sub0_2); 1598 case 128: 1599 return makeArrayRef(Sub0_3); 1600 case 160: 1601 return makeArrayRef(Sub0_4); 1602 case 256: 1603 return makeArrayRef(Sub0_7); 1604 case 512: 1605 return makeArrayRef(Sub0_15); 1606 case 1024: 1607 return makeArrayRef(Sub0_31); 1608 default: 1609 llvm_unreachable("unhandled register size"); 1610 } 1611 } 1612 1613 if (EltSize == 8) { 1614 static const int16_t Sub0_31_64[] = { 1615 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1616 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 1617 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 1618 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 1619 AMDGPU::sub16_sub17, AMDGPU::sub18_sub19, 1620 AMDGPU::sub20_sub21, AMDGPU::sub22_sub23, 1621 AMDGPU::sub24_sub25, AMDGPU::sub26_sub27, 1622 AMDGPU::sub28_sub29, AMDGPU::sub30_sub31 1623 }; 1624 1625 static const int16_t Sub0_15_64[] = { 1626 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1627 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 1628 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 1629 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15 1630 }; 1631 1632 static const int16_t Sub0_7_64[] = { 1633 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1634 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7 1635 }; 1636 1637 1638 static const int16_t Sub0_3_64[] = { 1639 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3 1640 }; 1641 1642 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1643 case 64: 1644 return {}; 1645 case 128: 1646 return makeArrayRef(Sub0_3_64); 1647 case 256: 1648 return makeArrayRef(Sub0_7_64); 1649 case 512: 1650 return makeArrayRef(Sub0_15_64); 1651 case 1024: 1652 return makeArrayRef(Sub0_31_64); 1653 default: 1654 llvm_unreachable("unhandled register size"); 1655 } 1656 } 1657 1658 if (EltSize == 16) { 1659 1660 static const int16_t Sub0_31_128[] = { 1661 AMDGPU::sub0_sub1_sub2_sub3, 1662 AMDGPU::sub4_sub5_sub6_sub7, 1663 AMDGPU::sub8_sub9_sub10_sub11, 1664 AMDGPU::sub12_sub13_sub14_sub15, 1665 AMDGPU::sub16_sub17_sub18_sub19, 1666 AMDGPU::sub20_sub21_sub22_sub23, 1667 AMDGPU::sub24_sub25_sub26_sub27, 1668 AMDGPU::sub28_sub29_sub30_sub31 1669 }; 1670 1671 static const int16_t Sub0_15_128[] = { 1672 AMDGPU::sub0_sub1_sub2_sub3, 1673 AMDGPU::sub4_sub5_sub6_sub7, 1674 AMDGPU::sub8_sub9_sub10_sub11, 1675 AMDGPU::sub12_sub13_sub14_sub15 1676 }; 1677 1678 static const int16_t Sub0_7_128[] = { 1679 AMDGPU::sub0_sub1_sub2_sub3, 1680 AMDGPU::sub4_sub5_sub6_sub7 1681 }; 1682 1683 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1684 case 128: 1685 return {}; 1686 case 256: 1687 return makeArrayRef(Sub0_7_128); 1688 case 512: 1689 return makeArrayRef(Sub0_15_128); 1690 case 1024: 1691 return makeArrayRef(Sub0_31_128); 1692 default: 1693 llvm_unreachable("unhandled register size"); 1694 } 1695 } 1696 1697 if (EltSize == 32) { 1698 static const int16_t Sub0_31_256[] = { 1699 AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, 1700 AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, 1701 AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23, 1702 AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 1703 }; 1704 1705 static const int16_t Sub0_15_256[] = { 1706 AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, 1707 AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 1708 }; 1709 1710 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1711 case 256: 1712 return {}; 1713 case 512: 1714 return makeArrayRef(Sub0_15_256); 1715 case 1024: 1716 return makeArrayRef(Sub0_31_256); 1717 default: 1718 llvm_unreachable("unhandled register size"); 1719 } 1720 } 1721 1722 assert(EltSize == 64 && "unhandled elt size"); 1723 static const int16_t Sub0_31_512[] = { 1724 AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, 1725 AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 1726 }; 1727 1728 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1729 case 512: 1730 return {}; 1731 case 1024: 1732 return makeArrayRef(Sub0_31_512); 1733 default: 1734 llvm_unreachable("unhandled register size"); 1735 } 1736 } 1737 1738 const TargetRegisterClass* 1739 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, 1740 Register Reg) const { 1741 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg); 1742 } 1743 1744 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, 1745 Register Reg) const { 1746 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 1747 assert(RC && "Register class for the reg not found"); 1748 return hasVGPRs(RC); 1749 } 1750 1751 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, 1752 Register Reg) const { 1753 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 1754 assert(RC && "Register class for the reg not found"); 1755 return hasAGPRs(RC); 1756 } 1757 1758 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, 1759 const TargetRegisterClass *SrcRC, 1760 unsigned SubReg, 1761 const TargetRegisterClass *DstRC, 1762 unsigned DstSubReg, 1763 const TargetRegisterClass *NewRC, 1764 LiveIntervals &LIS) const { 1765 unsigned SrcSize = getRegSizeInBits(*SrcRC); 1766 unsigned DstSize = getRegSizeInBits(*DstRC); 1767 unsigned NewSize = getRegSizeInBits(*NewRC); 1768 1769 // Do not increase size of registers beyond dword, we would need to allocate 1770 // adjacent registers and constraint regalloc more than needed. 1771 1772 // Always allow dword coalescing. 1773 if (SrcSize <= 32 || DstSize <= 32) 1774 return true; 1775 1776 return NewSize <= DstSize || NewSize <= SrcSize; 1777 } 1778 1779 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 1780 MachineFunction &MF) const { 1781 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1782 1783 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), 1784 MF.getFunction()); 1785 switch (RC->getID()) { 1786 default: 1787 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); 1788 case AMDGPU::VGPR_32RegClassID: 1789 case AMDGPU::VGPR_LO16RegClassID: 1790 case AMDGPU::VGPR_HI16RegClassID: 1791 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); 1792 case AMDGPU::SGPR_32RegClassID: 1793 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); 1794 } 1795 } 1796 1797 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 1798 unsigned Idx) const { 1799 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || 1800 Idx == AMDGPU::RegisterPressureSets::AGPR_32) 1801 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 1802 const_cast<MachineFunction &>(MF)); 1803 1804 if (Idx == AMDGPU::RegisterPressureSets::SReg_32) 1805 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 1806 const_cast<MachineFunction &>(MF)); 1807 1808 llvm_unreachable("Unexpected register pressure set!"); 1809 } 1810 1811 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 1812 static const int Empty[] = { -1 }; 1813 1814 if (RegPressureIgnoredUnits[RegUnit]) 1815 return Empty; 1816 1817 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); 1818 } 1819 1820 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 1821 // Not a callee saved register. 1822 return AMDGPU::SGPR30_SGPR31; 1823 } 1824 1825 const TargetRegisterClass * 1826 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 1827 const RegisterBank &RB, 1828 const MachineRegisterInfo &MRI) const { 1829 switch (Size) { 1830 case 1: { 1831 switch (RB.getID()) { 1832 case AMDGPU::VGPRRegBankID: 1833 return &AMDGPU::VGPR_32RegClass; 1834 case AMDGPU::VCCRegBankID: 1835 return isWave32 ? 1836 &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass; 1837 case AMDGPU::SGPRRegBankID: 1838 return &AMDGPU::SReg_32RegClass; 1839 default: 1840 llvm_unreachable("unknown register bank"); 1841 } 1842 } 1843 case 32: 1844 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : 1845 &AMDGPU::SReg_32RegClass; 1846 case 64: 1847 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : 1848 &AMDGPU::SReg_64RegClass; 1849 case 96: 1850 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : 1851 &AMDGPU::SReg_96RegClass; 1852 case 128: 1853 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : 1854 &AMDGPU::SGPR_128RegClass; 1855 case 160: 1856 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass : 1857 &AMDGPU::SReg_160RegClass; 1858 case 256: 1859 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass : 1860 &AMDGPU::SReg_256RegClass; 1861 case 512: 1862 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass : 1863 &AMDGPU::SReg_512RegClass; 1864 case 1024: 1865 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass : 1866 &AMDGPU::SReg_1024RegClass; 1867 default: 1868 if (Size < 32) 1869 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : 1870 &AMDGPU::SReg_32RegClass; 1871 return nullptr; 1872 } 1873 } 1874 1875 const TargetRegisterClass * 1876 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 1877 const MachineRegisterInfo &MRI) const { 1878 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 1879 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) 1880 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); 1881 1882 const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>(); 1883 return getAllocatableClass(RC); 1884 } 1885 1886 MCRegister SIRegisterInfo::getVCC() const { 1887 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 1888 } 1889 1890 const TargetRegisterClass * 1891 SIRegisterInfo::getRegClass(unsigned RCID) const { 1892 switch ((int)RCID) { 1893 case AMDGPU::SReg_1RegClassID: 1894 return getBoolRC(); 1895 case AMDGPU::SReg_1_XEXECRegClassID: 1896 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 1897 : &AMDGPU::SReg_64_XEXECRegClass; 1898 case -1: 1899 return nullptr; 1900 default: 1901 return AMDGPUGenRegisterInfo::getRegClass(RCID); 1902 } 1903 } 1904 1905 // Find reaching register definition 1906 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 1907 MachineInstr &Use, 1908 MachineRegisterInfo &MRI, 1909 LiveIntervals *LIS) const { 1910 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 1911 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 1912 SlotIndex DefIdx; 1913 1914 if (Reg.isVirtual()) { 1915 if (!LIS->hasInterval(Reg)) 1916 return nullptr; 1917 LiveInterval &LI = LIS->getInterval(Reg); 1918 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 1919 : MRI.getMaxLaneMaskForVReg(Reg); 1920 VNInfo *V = nullptr; 1921 if (LI.hasSubRanges()) { 1922 for (auto &S : LI.subranges()) { 1923 if ((S.LaneMask & SubLanes) == SubLanes) { 1924 V = S.getVNInfoAt(UseIdx); 1925 break; 1926 } 1927 } 1928 } else { 1929 V = LI.getVNInfoAt(UseIdx); 1930 } 1931 if (!V) 1932 return nullptr; 1933 DefIdx = V->def; 1934 } else { 1935 // Find last def. 1936 for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) { 1937 LiveRange &LR = LIS->getRegUnit(*Units); 1938 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 1939 if (!DefIdx.isValid() || 1940 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 1941 LIS->getInstructionFromIndex(V->def))) 1942 DefIdx = V->def; 1943 } else { 1944 return nullptr; 1945 } 1946 } 1947 } 1948 1949 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 1950 1951 if (!Def || !MDT.dominates(Def, &Use)) 1952 return nullptr; 1953 1954 assert(Def->modifiesRegister(Reg, this)); 1955 1956 return Def; 1957 } 1958