//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "SIRegisterInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static cl::opt<bool> EnableSpillSGPRToSMEM(
  "amdgpu-spill-sgpr-to-smem",
  cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
  cl::init(false));


static bool hasPressureSet(const int *PSets, unsigned PSetID) {
  for (unsigned i = 0; PSets[i] != -1; ++i) {
    if (PSets[i] == (int)PSetID)
      return true;
  }
  return false;
}

void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
                                         BitVector &PressureSets) const {
  for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
    const int *PSets = getRegUnitPressureSets(*U);
    if (hasPressureSet(PSets, PSetID)) {
      PressureSets.set(PSetID);
      break;
    }
  }
}

SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
                                   SGPRPressureSets(getNumRegPressureSets()),
                                   VGPRPressureSets(getNumRegPressureSets()) {
  unsigned NumRegPressureSets = getNumRegPressureSets();

  SGPRSetID = NumRegPressureSets;
  VGPRSetID = NumRegPressureSets;

  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
    classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
  }

  // Determine the number of reg units for each pressure set.
  std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
  for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
    const int *PSets = getRegUnitPressureSets(i);
    for (unsigned j = 0; PSets[j] != -1; ++j) {
      ++PressureSetRegUnits[PSets[j]];
    }
  }

  unsigned VGPRMax = 0, SGPRMax = 0;
  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
      VGPRSetID = i;
      VGPRMax = PressureSetRegUnits[i];
      continue;
    }
    if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
      SGPRSetID = i;
      SGPRMax = PressureSetRegUnits[i];
    }
  }

  assert(SGPRSetID < NumRegPressureSets &&
         VGPRSetID < NumRegPressureSets);
}

void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
  MCRegAliasIterator R(Reg, this, true);

  for (; R.isValid(); ++R)
    Reserved.set(*R);
}

unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
  const MachineFunction &MF) const {
  unsigned BaseIdx = alignDown(getMaxNumSGPRs(MF), 4) - 4;
  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}

unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
  const MachineFunction &MF) const {
  unsigned RegCount = getMaxNumSGPRs(MF);
  unsigned Reg;

  // Try to place it in a hole after PrivateSegmentBufferReg.
  if (RegCount & 3) {
    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
    // alignment constraints, so we have a hole where we can put the wave
    // offset.
    Reg = RegCount - 1;
  } else {
    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
    // wave offset before it.
    Reg = RegCount - 5;
  }
  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);

  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);

  unsigned MaxNumSGPRs = getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  unsigned MaxNumVGPRs = getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
  }

  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
  }

  return Reserved;
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  return Fn.getFrameInfo().hasStackObjects();
}

bool
SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
  return MF.getFrameInfo().hasStackObjects();
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
  const MachineFunction &MF) const {
  // m0 is needed for the scalar store offset. m0 is unallocatable, so we can't
  // create a virtual register for it during frame index elimination, so the
  // scavenger is directly needed.
  return MF.getFrameInfo().hasStackObjects() &&
         MF.getSubtarget<SISubtarget>().hasScalarStores() &&
         MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
  const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}

bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
  // This helps catch bugs as verifier errors.
  return true;
}

int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI))
    return 0;

  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::vaddr) &&
         "Should never see frame index on non-address operand");

  return getMUBUFInstrOffset(MI);
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!MI->mayLoadOrStore())
    return false;

  int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);

  return !isUInt<12>(FullOffset);
}

void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                  unsigned BaseReg,
                                                  int FrameIdx,
                                                  int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = Subtarget.getInstrInfo();

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
      .addFrameIndex(FrameIdx);
    return;
  }

  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
    .addFrameIndex(FrameIdx);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)
    .addReg(UnusedCarry, RegState::Define | RegState::Dead)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(FIReg);
}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
                                       int64_t Offset) const {

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = Subtarget.getInstrInfo();

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  assert(FIOp && FIOp->isFI() && "frame index must be address operand");

  assert(TII->isMUBUF(MI));

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;
  assert(isUInt<12>(NewOffset) && "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}

bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        unsigned BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI))
    return false;

  int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);

  return isUInt<12>(NewOffset);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
  const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  default:
    return -1;
  }
}

static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  default:
    return -1;
  }
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg();

  BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
    .addReg(Reg, getDefRegState(!IsStore))
    .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
    .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
    .addImm(Offset)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  return true;
}

void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                         unsigned LoadStoreOp,
                                         int Index,
                                         unsigned ValueReg,
                                         bool IsKill,
                                         unsigned ScratchRsrcReg,
                                         unsigned ScratchOffsetReg,
                                         int64_t InstOffset,
                                         MachineMemOperand *MMO,
                                         RegScavenger *RS) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();

  const MCInstrDesc &Desc = TII->get(LoadStoreOp);
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = Desc.mayStore();

  bool RanOutOfSGPRs = false;
  bool Scavenged = false;
  unsigned SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
  unsigned Size = NumSubRegs * 4;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  const int64_t OriginalImmOffset = Offset;

  unsigned Align = MFI.getObjectAlignment(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  if (!isUInt<12>(Offset + Size)) {
    SOffset = AMDGPU::NoRegister;

    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs().
    if (RS)
      SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);

    if (SOffset == AMDGPU::NoRegister) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset register, and then
      // subtract the offset after the spill to return ScratchOffset to its
      // original value.
      RanOutOfSGPRs = true;
      SOffset = ScratchOffsetReg;
    } else {
      Scavenged = true;
    }

    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
      .addReg(ScratchOffsetReg)
      .addImm(Offset);

    Offset = 0;
  }

  const unsigned EltSize = 4;

  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
    unsigned SubReg = NumSubRegs == 1 ?
      ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    if (i + 1 == e) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
    MachineMemOperand *NewMMO
      = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
                                 EltSize, MinAlign(Align, EltSize * i));

    auto MIB = BuildMI(*MBB, MI, DL, Desc)
      .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
      .addReg(ScratchRsrcReg)
      .addReg(SOffset, SOffsetRegState)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addMemOperand(NewMMO);

    if (NumSubRegs > 1)
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
  }

  if (RanOutOfSGPRs) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
      .addReg(ScratchOffsetReg)
      .addImm(OriginalImmOffset);
  }
}

static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
                                                     bool Store) {
  if (SuperRegSize % 16 == 0) {
    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
  }

  if (SuperRegSize % 8 == 0) {
    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
  }

  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR };
}

void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned SuperReg = MI->getOperand(0).getReg();
  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();

  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();

  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  unsigned ScalarStoreOp;
  unsigned EltSize = 4;
  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  if (SpillToSMEM && isSGPRClass(RC)) {
    // XXX - if private_element_size is larger than 4 it might be useful to be
    // able to spill wider vmem spills.
    std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
  }

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToSMEM) {
      int64_t FrOffset = FrameInfo.getObjectOffset(Index);

      // The allocated memory size is really the wavefront size * the frame
      // index size. The widest register class is 64 bytes, so a 4-byte scratch
      // allocation is enough to spill this in a single stack object.
      //
      // FIXME: Frame size/offsets are computed earlier than this, so the extra
      // space is still unnecessarily allocated.

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));

      // SMEM instructions only support a single offset, so increment the wave
      // offset.

      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg())
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg());
      }

      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
        .addReg(SubReg, getKillRegState(IsKill)) // sdata
        .addReg(MFI->getScratchRSrcReg())        // sbase
        .addReg(OffsetReg, RegState::Kill)       // soff
        .addImm(0)                               // glc
        .addMemOperand(MMO);

      continue;
    }

    struct SIMachineFunctionInfo::SpilledReg Spill =
      MFI->getSpilledReg(MF, Index, i);
    if (Spill.hasReg()) {
      BuildMI(*MBB, MI, DL,
              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
              Spill.VGPR)
        .addReg(SubReg, getKillRegState(IsKill))
        .addImm(Spill.Lane);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    } else {
      // Spill SGPR to a frame index.
      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      MachineInstrBuilder Mov
        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
        .addReg(SubReg, SubKillState);


      // There could be undef components of a spilled super register.
      // TODO: Can we detect this and skip the spill?
      if (NumSubRegs > 1) {
        // The last implicit use of the SuperReg carries the "Kill" flag.
        unsigned SuperKillState = 0;
        if (i + 1 == e)
          SuperKillState |= getKillRegState(IsKill);
        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
      }

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
        .addReg(TmpReg, RegState::Kill)         // src
        .addFrameIndex(Index)                   // vaddr
        .addReg(MFI->getScratchRSrcReg())       // srsrc
        .addReg(MFI->getScratchWaveOffsetReg()) // soffset
        .addImm(i * 4)                          // offset
        .addMemOperand(MMO);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  MFI->addToSpilledSGPRs(NumSubRegs);
}

void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();

  unsigned SuperReg = MI->getOperand(0).getReg();
  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  unsigned EltSize = 4;
  unsigned ScalarLoadOp;

  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  if (SpillToSMEM && isSGPRClass(RC)) {
    // XXX - if private_element_size is larger than 4 it might be useful to be
    // able to spill wider vmem spills.
    std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
  }

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  int64_t FrOffset = FrameInfo.getObjectOffset(Index);

  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToSMEM) {
      // FIXME: Size may be > 4 but extra bytes wasted.
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                   EltSize, MinAlign(Align, EltSize * i));

      // Add i * 4 offset
      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg())
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg());
      }

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
        .addReg(MFI->getScratchRSrcReg())  // sbase
        .addReg(OffsetReg, RegState::Kill) // soff
        .addImm(0)                         // glc
        .addMemOperand(MMO);

      if (NumSubRegs > 1)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);

      continue;
    }

    SIMachineFunctionInfo::SpilledReg Spill
      = MFI->getSpilledReg(MF, Index, i);

    if (Spill.hasReg()) {
      auto MIB =
        BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
                SubReg)
        .addReg(Spill.VGPR)
        .addImm(Spill.Lane);

      if (NumSubRegs > 1)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);
    } else {
      // Restore SGPR from a stack slot.
      // FIXME: We should use S_LOAD_DWORD here for VI.
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned Align = FrameInfo.getObjectAlignment(Index);

      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);

      MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
        MachineMemOperand::MOLoad, EltSize,
        MinAlign(Align, EltSize * i));

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
        .addFrameIndex(Index)                   // vaddr
        .addReg(MFI->getScratchRSrcReg())       // srsrc
        .addReg(MFI->getScratchWaveOffsetReg()) // soffset
        .addImm(i * 4)                          // offset
        .addMemOperand(MMO);

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
        .addReg(TmpReg, RegState::Kill);

      if (NumSubRegs > 1)
        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
}

void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                        int SPAdj, unsigned FIOperandNum,
                                        RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  switch (MI->getOpcode()) {
  // SGPR register spill
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE: {
    spillSGPR(MI, Index, RS);
    break;
  }

  // SGPR register restore
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE: {
    restoreSGPR(MI, Index, RS);
    break;
  }

  // VGPR register spill
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V32_SAVE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
          Index,
          VData->getReg(), VData->isKill(),
          TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
          TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
          *MI->memoperands_begin(),
          RS);
    MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_V512_RESTORE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);

    buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
          Index,
          VData->getReg(), VData->isKill(),
          TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
          TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
          *MI->memoperands_begin(),
          RS);
    MI->eraseFromParent();
    break;
  }

  default: {
    if (TII->isMUBUF(*MI)) {
      // Disable offen so we don't need a 0 vgpr base.
      assert(static_cast<int>(FIOperandNum) ==
             AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                        AMDGPU::OpName::vaddr));

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      int64_t OldImm
        = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
      int64_t NewOffset = OldImm + Offset;

      if (isUInt<12>(NewOffset) &&
          buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
        MI->eraseFromParent();
        break;
      }
    }

    int64_t Offset = FrameInfo.getObjectOffset(Index);
    FIOp.ChangeToImmediate(Offset);
    if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, MI, MI->getDebugLoc(),
              TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
        .addImm(Offset);
      FIOp.ChangeToRegister(TmpReg, false, false, true);
    }
  }
  }
}

// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
  assert(!TargetRegisterInfo::isVirtualRegister(Reg));

  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::SCC_CLASSRegClass,
  };

  for (const TargetRegisterClass *BaseClass : BaseClasses) {
    if (BaseClass->contains(Reg)) {
      return BaseClass;
    }
  }
  return nullptr;
}

// TODO: It might be helpful to have some target specific flags in
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
  switch (RC->getSize()) {
  case 0: return false;
  case 1: return false;
  case 4:
    return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
  case 8:
    return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
  case 12:
    return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
  case 16:
    return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
  case 32:
    return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
  case 64:
    return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
                                        const TargetRegisterClass *SRC) const {
  switch (SRC->getSize()) {
  case 4:
    return &AMDGPU::VGPR_32RegClass;
  case 8:
    return &AMDGPU::VReg_64RegClass;
  case 12:
    return &AMDGPU::VReg_96RegClass;
  case 16:
    return &AMDGPU::VReg_128RegClass;
  case 32:
    return &AMDGPU::VReg_256RegClass;
  case 64:
    return &AMDGPU::VReg_512RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
                                        const TargetRegisterClass *VRC) const {
  switch (VRC->getSize()) {
  case 4:
    return &AMDGPU::SGPR_32RegClass;
  case 8:
    return &AMDGPU::SReg_64RegClass;
  case 16:
    return &AMDGPU::SReg_128RegClass;
  case 32:
    return &AMDGPU::SReg_256RegClass;
  case 64:
    return &AMDGPU::SReg_512RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
  const TargetRegisterClass *RC, unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  LaneBitmask::Type Mask = getSubRegIndexLaneMask(SubIdx).getAsInteger();
  unsigned Count = countPopulation(Mask);
  if (isSGPRClass(RC)) {
    switch (Count) {
    case 1:
      return &AMDGPU::SGPR_32RegClass;
    case 2:
      return &AMDGPU::SReg_64RegClass;
    case 4:
      return &AMDGPU::SReg_128RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  } else {
    switch (Count) {
    case 1:
      return &AMDGPU::VGPR_32RegClass;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    case 8:
      return &AMDGPU::VReg_256RegClass;
    case 16: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  }
}

bool SIRegisterInfo::shouldRewriteCopySrc(
  const TargetRegisterClass *DefRC,
  unsigned DefSubReg,
  const TargetRegisterClass *SrcRC,
  unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want
  // to stop and rewrite on anything that looks like a subregister
  // extract. Operations mostly don't care about the super register class, so
  // we only want to stop on the most basic of copies between the same register
  // class.
  //
  // e.g. if we have something like
  // vreg0 = ...
  // vreg1 = ...
  // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
  // vreg3 = COPY vreg2, sub0
  //
  // We want to look through the COPY to find:
  // => vreg3 = COPY vreg0

  // Plain copy.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

// FIXME: Most of these are flexible with HSA and we don't need to reserve them
// as input registers if unused. Whether the dispatch ptr is necessary should
// be easy to detect from used intrinsics. Scratch setup is harder to know.
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
                                           enum PreloadedValue Value) const {

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  (void)ST;
  switch (Value) {
  case SIRegisterInfo::WORKGROUP_ID_X:
    assert(MFI->hasWorkGroupIDX());
    return MFI->WorkGroupIDXSystemSGPR;
  case SIRegisterInfo::WORKGROUP_ID_Y:
    assert(MFI->hasWorkGroupIDY());
    return MFI->WorkGroupIDYSystemSGPR;
  case SIRegisterInfo::WORKGROUP_ID_Z:
    assert(MFI->hasWorkGroupIDZ());
    return MFI->WorkGroupIDZSystemSGPR;
  case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
    return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
  case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
    assert(ST.isAmdCodeObjectV2() &&
           "Non-CodeObjectV2 ABI currently uses relocations");
    assert(MFI->hasPrivateSegmentBuffer());
    return MFI->PrivateSegmentBufferUserSGPR;
  case SIRegisterInfo::KERNARG_SEGMENT_PTR:
    assert(MFI->hasKernargSegmentPtr());
    return MFI->KernargSegmentPtrUserSGPR;
  case SIRegisterInfo::DISPATCH_ID:
    assert(MFI->hasDispatchID());
    return MFI->DispatchIDUserSGPR;
  case SIRegisterInfo::FLAT_SCRATCH_INIT:
    assert(MFI->hasFlatScratchInit());
    return MFI->FlatScratchInitUserSGPR;
  case SIRegisterInfo::DISPATCH_PTR:
    assert(MFI->hasDispatchPtr());
    return MFI->DispatchPtrUserSGPR;
  case SIRegisterInfo::QUEUE_PTR:
    assert(MFI->hasQueuePtr());
    return MFI->QueuePtrUserSGPR;
  case SIRegisterInfo::WORKITEM_ID_X:
    assert(MFI->hasWorkItemIDX());
    return AMDGPU::VGPR0;
  case SIRegisterInfo::WORKITEM_ID_Y:
    assert(MFI->hasWorkItemIDY());
    return AMDGPU::VGPR1;
  case SIRegisterInfo::WORKITEM_ID_Z:
    assert(MFI->hasWorkItemIDZ());
    return AMDGPU::VGPR2;
  }
  llvm_unreachable("unexpected preloaded value type");
}

/// \brief Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister.
unsigned
SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
                                   const TargetRegisterClass *RC,
                                   const MachineFunction &MF) const {

  for (unsigned Reg : *RC)
    if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
      return Reg;
  return AMDGPU::NoRegister;
}

unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const {
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return 800;
  return 512;
}

unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const {
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return 102;
  return 104;
}

unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST,
                                       const SIMachineFunctionInfo &MFI) const {
  if (MFI.hasFlatScratchInit()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order)

    if (ST.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order)
  }

  if (ST.isXNACKEnabled())
    return 4; // XNACK, VCC (in that order)

  return 2; // VCC.
}

unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST,
                                        unsigned WavesPerEU) const {
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    switch (WavesPerEU) {
    case 0: return 0;
    case 10: return 0;
    case 9: return 0;
    case 8: return 81;
    default: return 97;
    }
  } else {
    switch (WavesPerEU) {
    case 0: return 0;
    case 10: return 0;
    case 9: return 49;
    case 8: return 57;
    case 7: return 65;
    case 6: return 73;
    case 5: return 81;
    default: return 97;
    }
  }
}

unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST,
                                        unsigned WavesPerEU,
                                        bool Addressable) const {
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    switch (WavesPerEU) {
    case 0: return 80;
    case 10: return 80;
    case 9: return 80;
    case 8: return 96;
    default: return Addressable ? getNumAddressableSGPRs(ST) : 112;
    }
  } else {
    switch (WavesPerEU) {
    case 0: return 48;
    case 10: return 48;
    case 9: return 56;
    case 8: return 64;
    case 7: return 72;
    case 6: return 80;
    case 5: return 96;
    default: return getNumAddressableSGPRs(ST);
    }
  }
}

unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, false);
  unsigned MaxNumAddressableSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < NumInputSGPRs)
      Requested = NumInputSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (ST.hasSGPRInitBug())
    MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI),
                  MaxNumAddressableSGPRs);
}

unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs(
  const SISubtarget &ST) const {
  if (ST.debuggerReserveRegs())
    return 4;
  return 0;
}

unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const {
  switch (WavesPerEU) {
  case 0: return 0;
  case 10: return 0;
  case 9: return 25;
  case 8: return 29;
  case 7: return 33;
  case 6: return 37;
  case 5: return 41;
  case 4: return 49;
  case 3: return 65;
  case 2: return 85;
  default: return 129;
  }
}

unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const {
  switch (WavesPerEU) {
  case 0: return 24;
  case 10: return 24;
  case 9: return 28;
  case 8: return 32;
  case 7: return 36;
  case 6: return 40;
  case 5: return 48;
  case 4: return 64;
  case 3: return 84;
  case 2: return 128;
  default: return getTotalNumVGPRs();
  }
}

unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getNumDebuggerReservedVGPRs(ST))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getNumDebuggerReservedVGPRs(ST);
}

ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                   unsigned EltSize) const {
  if (EltSize == 4) {
    static const int16_t Sub0_15[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
    };

    static const int16_t Sub0_7[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    };

    static const int16_t Sub0_3[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    };

    static const int16_t Sub0_2[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
    };

    static const int16_t Sub0_1[] = {
      AMDGPU::sub0, AMDGPU::sub1,
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 32:
      return {};
    case 64:
      return makeArrayRef(Sub0_1);
    case 96:
      return makeArrayRef(Sub0_2);
    case 128:
      return makeArrayRef(Sub0_3);
    case 256:
      return makeArrayRef(Sub0_7);
    case 512:
      return makeArrayRef(Sub0_15);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  if (EltSize == 8) {
    static const int16_t Sub0_15_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
    };

    static const int16_t Sub0_7_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
    };


    static const int16_t Sub0_3_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 64:
      return {};
    case 128:
      return makeArrayRef(Sub0_3_64);
    case 256:
      return makeArrayRef(Sub0_7_64);
    case 512:
      return makeArrayRef(Sub0_15_64);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  assert(EltSize == 16 && "unhandled register spill split size");

  static const int16_t Sub0_15_128[] = {
    AMDGPU::sub0_sub1_sub2_sub3,
    AMDGPU::sub4_sub5_sub6_sub7,
    AMDGPU::sub8_sub9_sub10_sub11,
    AMDGPU::sub12_sub13_sub14_sub15
  };

  static const int16_t Sub0_7_128[] = {
    AMDGPU::sub0_sub1_sub2_sub3,
    AMDGPU::sub4_sub5_sub6_sub7
  };

  switch (AMDGPU::getRegBitWidth(*RC->MC)) {
  case 128:
    return {};
  case 256:
    return makeArrayRef(Sub0_7_128);
  case 512:
    return makeArrayRef(Sub0_15_128);
  default:
    llvm_unreachable("unhandled register size");
  }
}

const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  unsigned Reg) const {
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    return MRI.getRegClass(Reg);

  return getPhysRegClass(Reg);
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            unsigned Reg) const {
  return hasVGPRs(getRegClassForReg(MRI, Reg));
}