1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI implementation of the TargetRegisterInfo class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIRegisterInfo.h" 15 #include "AMDGPURegisterBankInfo.h" 16 #include "AMDGPUSubtarget.h" 17 #include "SIInstrInfo.h" 18 #include "SIMachineFunctionInfo.h" 19 #include "MCTargetDesc/AMDGPUInstPrinter.h" 20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 21 #include "llvm/CodeGen/LiveIntervals.h" 22 #include "llvm/CodeGen/MachineDominators.h" 23 #include "llvm/CodeGen/MachineFrameInfo.h" 24 #include "llvm/CodeGen/MachineInstrBuilder.h" 25 #include "llvm/CodeGen/RegisterScavenging.h" 26 #include "llvm/CodeGen/SlotIndexes.h" 27 #include "llvm/IR/Function.h" 28 #include "llvm/IR/LLVMContext.h" 29 30 using namespace llvm; 31 32 static bool hasPressureSet(const int *PSets, unsigned PSetID) { 33 for (unsigned i = 0; PSets[i] != -1; ++i) { 34 if (PSets[i] == (int)PSetID) 35 return true; 36 } 37 return false; 38 } 39 40 void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg, 41 BitVector &PressureSets) const { 42 for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) { 43 const int *PSets = getRegUnitPressureSets(*U); 44 if (hasPressureSet(PSets, PSetID)) { 45 PressureSets.set(PSetID); 46 break; 47 } 48 } 49 } 50 51 static cl::opt<bool> EnableSpillSGPRToSMEM( 52 "amdgpu-spill-sgpr-to-smem", 53 cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), 54 cl::init(false)); 55 56 static cl::opt<bool> EnableSpillSGPRToVGPR( 57 "amdgpu-spill-sgpr-to-vgpr", 58 cl::desc("Enable spilling VGPRs to 
SGPRs"), 59 cl::ReallyHidden, 60 cl::init(true)); 61 62 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : 63 AMDGPURegisterInfo(), 64 SGPRPressureSets(getNumRegPressureSets()), 65 VGPRPressureSets(getNumRegPressureSets()), 66 AGPRPressureSets(getNumRegPressureSets()), 67 SpillSGPRToVGPR(false), 68 SpillSGPRToSMEM(false), 69 isWave32(ST.isWave32()) { 70 if (EnableSpillSGPRToSMEM && ST.hasScalarStores()) 71 SpillSGPRToSMEM = true; 72 else if (EnableSpillSGPRToVGPR) 73 SpillSGPRToVGPR = true; 74 75 unsigned NumRegPressureSets = getNumRegPressureSets(); 76 77 SGPRSetID = NumRegPressureSets; 78 VGPRSetID = NumRegPressureSets; 79 AGPRSetID = NumRegPressureSets; 80 81 for (unsigned i = 0; i < NumRegPressureSets; ++i) { 82 classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets); 83 classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets); 84 classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets); 85 } 86 87 // Determine the number of reg units for each pressure set. 88 std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0); 89 for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) { 90 const int *PSets = getRegUnitPressureSets(i); 91 for (unsigned j = 0; PSets[j] != -1; ++j) { 92 ++PressureSetRegUnits[PSets[j]]; 93 } 94 } 95 96 unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0; 97 for (unsigned i = 0; i < NumRegPressureSets; ++i) { 98 if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) { 99 VGPRSetID = i; 100 VGPRMax = PressureSetRegUnits[i]; 101 continue; 102 } 103 if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) { 104 SGPRSetID = i; 105 SGPRMax = PressureSetRegUnits[i]; 106 } 107 if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) { 108 AGPRSetID = i; 109 AGPRMax = PressureSetRegUnits[i]; 110 continue; 111 } 112 } 113 114 assert(SGPRSetID < NumRegPressureSets && 115 VGPRSetID < NumRegPressureSets && 116 AGPRSetID < NumRegPressureSets); 117 } 118 119 unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( 120 const 
MachineFunction &MF) const { 121 122 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 123 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; 124 unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); 125 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); 126 } 127 128 static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { 129 unsigned Reg; 130 131 // Try to place it in a hole after PrivateSegmentBufferReg. 132 if (RegCount & 3) { 133 // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to 134 // alignment constraints, so we have a hole where can put the wave offset. 135 Reg = RegCount - 1; 136 } else { 137 // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the 138 // wave offset before it. 139 Reg = RegCount - 5; 140 } 141 142 return Reg; 143 } 144 145 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( 146 const MachineFunction &MF) const { 147 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 148 unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); 149 return AMDGPU::SGPR_32RegClass.getRegister(Reg); 150 } 151 152 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { 153 BitVector Reserved(getNumRegs()); 154 155 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but 156 // this seems likely to result in bugs, so I'm marking them as reserved. 157 reserveRegisterTuples(Reserved, AMDGPU::EXEC); 158 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); 159 160 // M0 has to be reserved so that llvm accepts it as a live-in into a block. 161 reserveRegisterTuples(Reserved, AMDGPU::M0); 162 163 // Reserve src_vccz, src_execz, src_scc. 164 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); 165 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); 166 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); 167 168 // Reserve the memory aperture registers. 
169 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); 170 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); 171 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); 172 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); 173 174 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. 175 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); 176 177 // Reserve xnack_mask registers - support is not implemented in Codegen. 178 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); 179 180 // Reserve lds_direct register - support is not implemented in Codegen. 181 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); 182 183 // Reserve Trap Handler registers - support is not implemented in Codegen. 184 reserveRegisterTuples(Reserved, AMDGPU::TBA); 185 reserveRegisterTuples(Reserved, AMDGPU::TMA); 186 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); 187 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); 188 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); 189 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); 190 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); 191 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); 192 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); 193 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); 194 195 // Reserve null register - it shall never be allocated 196 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); 197 198 // Disallow vcc_hi allocation in wave32. It may be allocated but most likely 199 // will result in bugs. 
200 if (isWave32) { 201 Reserved.set(AMDGPU::VCC); 202 Reserved.set(AMDGPU::VCC_HI); 203 } 204 205 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 206 207 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 208 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 209 for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { 210 unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); 211 reserveRegisterTuples(Reserved, Reg); 212 } 213 214 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); 215 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 216 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { 217 unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); 218 reserveRegisterTuples(Reserved, Reg); 219 Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 220 reserveRegisterTuples(Reserved, Reg); 221 } 222 223 // Reserve all the rest AGPRs if there are no instructions to use it. 224 if (!ST.hasMAIInsts()) { 225 for (unsigned i = 0; i < MaxNumVGPRs; ++i) { 226 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 227 reserveRegisterTuples(Reserved, Reg); 228 } 229 } 230 231 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 232 233 unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); 234 if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { 235 // Reserve 1 SGPR for scratch wave offset in case we need to spill. 236 reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); 237 } 238 239 unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); 240 if (ScratchRSrcReg != AMDGPU::NoRegister) { 241 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need 242 // to spill. 243 // TODO: May need to reserve a VGPR if doing LDS spilling. 244 reserveRegisterTuples(Reserved, ScratchRSrcReg); 245 assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); 246 } 247 248 // We have to assume the SP is needed in case there are calls in the function, 249 // which is detected after the function is lowered. 
If we aren't really going 250 // to need SP, don't bother reserving it. 251 unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); 252 253 if (StackPtrReg != AMDGPU::NoRegister) { 254 reserveRegisterTuples(Reserved, StackPtrReg); 255 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); 256 } 257 258 unsigned FrameReg = MFI->getFrameOffsetReg(); 259 if (FrameReg != AMDGPU::NoRegister) { 260 reserveRegisterTuples(Reserved, FrameReg); 261 assert(!isSubRegister(ScratchRSrcReg, FrameReg)); 262 } 263 264 for (unsigned Reg : MFI->WWMReservedRegs) { 265 reserveRegisterTuples(Reserved, Reg); 266 } 267 268 // FIXME: Stop using reserved registers for this. 269 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) 270 reserveRegisterTuples(Reserved, Reg); 271 272 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) 273 reserveRegisterTuples(Reserved, Reg); 274 275 return Reserved; 276 } 277 278 bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const { 279 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 280 // On entry, the base address is 0, so it can't possibly need any more 281 // alignment. 282 283 // FIXME: Should be able to specify the entry frame alignment per calling 284 // convention instead. 285 if (Info->isEntryFunction()) 286 return false; 287 288 return TargetRegisterInfo::canRealignStack(MF); 289 } 290 291 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { 292 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); 293 if (Info->isEntryFunction()) { 294 const MachineFrameInfo &MFI = Fn.getFrameInfo(); 295 return MFI.hasStackObjects() || MFI.hasCalls(); 296 } 297 298 // May need scavenger for dealing with callee saved registers. 
299 return true; 300 } 301 302 bool SIRegisterInfo::requiresFrameIndexScavenging( 303 const MachineFunction &MF) const { 304 const MachineFrameInfo &MFI = MF.getFrameInfo(); 305 if (MFI.hasStackObjects()) 306 return true; 307 308 // May need to deal with callee saved registers. 309 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 310 return !Info->isEntryFunction(); 311 } 312 313 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( 314 const MachineFunction &MF) const { 315 const MachineFrameInfo &MFI = MF.getFrameInfo(); 316 if (!MFI.hasStackObjects()) 317 return false; 318 319 // The scavenger is used for large frames which may require finding a free 320 // register for large offsets. 321 if (!isUInt<12>(MFI.getStackSize())) 322 return true; 323 324 // If using scalar stores, for spills, m0 is needed for the scalar store 325 // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual 326 // register for it during frame index elimination, so the scavenger is 327 // directly needed. 328 return MF.getSubtarget<GCNSubtarget>().hasScalarStores() && 329 MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs(); 330 } 331 332 bool SIRegisterInfo::requiresVirtualBaseRegisters( 333 const MachineFunction &) const { 334 // There are no special dedicated stack or frame pointers. 335 return true; 336 } 337 338 bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { 339 // This helps catch bugs as verifier errors. 
340 return true; 341 } 342 343 int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const { 344 assert(SIInstrInfo::isMUBUF(*MI)); 345 346 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 347 AMDGPU::OpName::offset); 348 return MI->getOperand(OffIdx).getImm(); 349 } 350 351 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, 352 int Idx) const { 353 if (!SIInstrInfo::isMUBUF(*MI)) 354 return 0; 355 356 assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 357 AMDGPU::OpName::vaddr) && 358 "Should never see frame index on non-address operand"); 359 360 return getMUBUFInstrOffset(MI); 361 } 362 363 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { 364 if (!MI->mayLoadOrStore()) 365 return false; 366 367 int64_t FullOffset = Offset + getMUBUFInstrOffset(MI); 368 369 return !isUInt<12>(FullOffset); 370 } 371 372 void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, 373 unsigned BaseReg, 374 int FrameIdx, 375 int64_t Offset) const { 376 MachineBasicBlock::iterator Ins = MBB->begin(); 377 DebugLoc DL; // Defaults to "unknown" 378 379 if (Ins != MBB->end()) 380 DL = Ins->getDebugLoc(); 381 382 MachineFunction *MF = MBB->getParent(); 383 const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); 384 const SIInstrInfo *TII = Subtarget.getInstrInfo(); 385 386 if (Offset == 0) { 387 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg) 388 .addFrameIndex(FrameIdx); 389 return; 390 } 391 392 MachineRegisterInfo &MRI = MF->getRegInfo(); 393 unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 394 395 unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 396 397 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 398 .addImm(Offset); 399 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg) 400 .addFrameIndex(FrameIdx); 401 402 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) 403 .addReg(OffsetReg, 
RegState::Kill) 404 .addReg(FIReg) 405 .addImm(0); // clamp bit 406 } 407 408 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, 409 int64_t Offset) const { 410 411 MachineBasicBlock *MBB = MI.getParent(); 412 MachineFunction *MF = MBB->getParent(); 413 const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); 414 const SIInstrInfo *TII = Subtarget.getInstrInfo(); 415 416 #ifndef NDEBUG 417 // FIXME: Is it possible to be storing a frame index to itself? 418 bool SeenFI = false; 419 for (const MachineOperand &MO: MI.operands()) { 420 if (MO.isFI()) { 421 if (SeenFI) 422 llvm_unreachable("should not see multiple frame indices"); 423 424 SeenFI = true; 425 } 426 } 427 #endif 428 429 MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); 430 assert(FIOp && FIOp->isFI() && "frame index must be address operand"); 431 assert(TII->isMUBUF(MI)); 432 assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() == 433 MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() && 434 "should only be seeing frame offset relative FrameIndex"); 435 436 437 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); 438 int64_t NewOffset = OffsetOp->getImm() + Offset; 439 assert(isUInt<12>(NewOffset) && "offset should be legal"); 440 441 FIOp->ChangeToRegister(BaseReg, false); 442 OffsetOp->setImm(NewOffset); 443 } 444 445 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, 446 unsigned BaseReg, 447 int64_t Offset) const { 448 if (!SIInstrInfo::isMUBUF(*MI)) 449 return false; 450 451 int64_t NewOffset = Offset + getMUBUFInstrOffset(MI); 452 453 return isUInt<12>(NewOffset); 454 } 455 456 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( 457 const MachineFunction &MF, unsigned Kind) const { 458 // This is inaccurate. It depends on the instruction and address space. 
// The only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

// Maps a spill pseudo opcode (SI_SPILL_{S,V,A}<bits>_{SAVE,RESTORE}) to the
// number of 32-bit sub-registers it covers, i.e. <bits> / 32. Any other opcode
// is a programming error and hits llvm_unreachable.
static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

// Maps a BUFFER_STORE_*_OFFEN opcode to the matching BUFFER_STORE_*_OFFSET
// opcode. Returns -1 for any opcode that is not in the table.
static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}

// Maps a BUFFER_LOAD_*_OFFEN opcode to the matching BUFFER_LOAD_*_OFFSET
// opcode. Returns -1 for any opcode that is not in the table.
static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return
AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

// Tries to service lane \p Lane of spill slot \p Index with a register copy
// instead of a memory access. If SIMachineFunctionInfo has no register
// assigned for that (Index, Lane) pair, nothing is emitted and an empty
// MachineInstrBuilder is returned (its getInstr() is null). Otherwise a
// V_ACCVGPR_WRITE_B32 or V_ACCVGPR_READ_B32 is inserted before \p MI and
// returned. Whether \p MI is a store decides the copy direction.
static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
                                           int Index,
                                           unsigned Lane,
                                           unsigned ValueReg,
                                           bool IsKill) {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());

  // A store copies ValueReg into the spill register; a reload copies the
  // spill register back into ValueReg.
  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  // WRITE is used when storing into a non-VGPR spill register or reloading
  // from a VGPR spill register; READ is used in the two opposite cases.
  unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
                                                    : AMDGPU::V_ACCVGPR_READ_B32;

  return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
           .addReg(Src, getKillRegState(IsKill));
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
//
// Inserts before \p MI the _OFFSET form of the _OFFEN MUBUF load/store \p MI,
// using \p Offset as the immediate offset. Returns false if \p MI's opcode has
// no _OFFSET counterpart in getOffsetMUBUFStore / getOffsetMUBUFLoad. Returns
// true if a replacement was emitted, either the MUBUF instruction or, when
// spillVGPRtoAGPR succeeds for lane 0, a register copy. \p MI itself is not
// erased here.
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // glc
          .addImm(0) // slc
          .addImm(0) // tfe
          .addImm(0) // dlc
          .cloneMemRefs(*MI);

  // Carry over the vdata_in operand if the original instruction has one.
  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

// Emits, before \p MI, the instructions that spill \p ValueReg to (or reload
// it from) frame index \p Index, one 32-bit element at a time, using
// \p LoadStoreOp for each memory access. Whether this is a store or a load is
// taken from the instruction description of \p LoadStoreOp.
//
// \p ScratchRsrcReg and \p ScratchOffsetReg supply the resource descriptor and
// scalar offset operands. \p InstOffset is added to the frame object's offset.
// \p MMO provides the pointer info and flags for the per-element memory
// operands. \p RS may be null; it is only used to scavenge an SGPR when the
// offset does not fit in the 12-bit immediate field.
void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                         unsigned LoadStoreOp,
                                         int Index,
                                         unsigned ValueReg,
                                         bool IsKill,
                                         unsigned ScratchRsrcReg,
                                         unsigned ScratchOffsetReg,
                                         int64_t InstOffset,
                                         MachineMemOperand *MMO,
                                         RegScavenger *RS) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();

  const MCInstrDesc &Desc = TII->get(LoadStoreOp);
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = Desc.mayStore();

  bool Scavenged = false;
  unsigned SOffset = ScratchOffsetReg;

  // The register is always split into 4-byte elements.
  const unsigned EltSize = 4;
  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
  unsigned Size = NumSubRegs * EltSize;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  // Non-zero only if ScratchOffsetReg itself had to be modified below; it is
  // then the amount that must be subtracted again at the end.
  int64_t ScratchOffsetRegDelta = 0;

  unsigned Align = MFI.getObjectAlignment(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  // For register classes with AGPRs, the spill pseudo provides a "tmp" operand
  // that is used to stage each element on its way to or from memory.
  Register TmpReg =
    hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
                 : Register();

  assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");

  // Offset + Size - EltSize is the offset of the last element; if that does
  // not fit in the 12-bit immediate, the offset has to go into an SGPR.
  if (!isUInt<12>(Offset + Size - EltSize)) {
    SOffset = AMDGPU::NoRegister;

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
    Offset *= ST.getWavefrontSize();

    // We don't have access to the register scavenger if this function is called
    // during PEI::scavengeFrameVirtualRegs().
    if (RS)
      SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);

    if (SOffset == AMDGPU::NoRegister) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset register, and then
      // subtract the offset after the spill to return ScratchOffset to its
      // original value.
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
      .addReg(ScratchOffsetReg)
      .addImm(Offset);

    // The whole offset now lives in SOffset; the immediates start from zero.
    Offset = 0;
  }

  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
    unsigned SubReg = NumSubRegs == 1 ?
      Register(ValueReg) : getSubReg(ValueReg, getSubRegFromChannel(i));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    if (i + 1 == e) {
      // A scavenged offset register is killed by its last use.
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    // Prefer a register-to-register copy if one is assigned for this lane.
    auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);

    if (!MIB.getInstr()) {
      unsigned FinalReg = SubReg;
      if (TmpReg != AMDGPU::NoRegister) {
        // Stage the element through TmpReg: for a store it is read out of
        // SubReg first; for a load it is written back to FinalReg below.
        if (IsStore)
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
            .addReg(SubReg, getKillRegState(IsKill));
        SubReg = TmpReg;
      }

      MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
      MachineMemOperand *NewMMO
        = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
                                   EltSize, MinAlign(Align, EltSize * i));

      MIB = BuildMI(*MBB, MI, DL, Desc)
        .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
        .addReg(ScratchRsrcReg)
        .addReg(SOffset, SOffsetRegState)
        .addImm(Offset)
        .addImm(0) // glc
        .addImm(0) // slc
        .addImm(0) // tfe
        .addImm(0) // dlc
        .addMemOperand(NewMMO);

      if (!IsStore && TmpReg != AMDGPU::NoRegister)
        MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
                      FinalReg)
          .addReg(TmpReg, RegState::Kill);
    }

    // When working on sub-registers, attach the full register as an implicit
    // operand of whichever instruction MIB currently refers to.
    if (NumSubRegs > 1)
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
  }

  if (ScratchOffsetRegDelta != 0) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
        .addReg(ScratchOffsetReg)
        .addImm(ScratchOffsetRegDelta);
  }
}

// Picks the widest scalar buffer access (16, 8 or 4 bytes) whose size evenly
// divides \p SuperRegSize (in bytes). Returns {element size in bytes, opcode},
// where the opcode is the S_BUFFER_STORE_* variant if \p Store is true and the
// S_BUFFER_LOAD_* variant otherwise.
static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
                                                     bool Store) {
  if (SuperRegSize % 16 == 0) {
    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
  }

  if (SuperRegSize % 8 == 0) {
    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
  }

  return { 4, Store ?
AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
}

// Expands the SGPR spill at \p MI (operand 0 is the register being spilled)
// for frame index \p Index. Each piece of the register is spilled in one of
// three ways:
//   - SMEM: a scalar buffer store, with m0 holding the offset;
//   - VGPR lanes: V_WRITELANE_B32 into the lanes SIMachineFunctionInfo has
//     assigned to this frame index;
//   - memory: a copy into a temporary VGPR followed by SI_SPILL_V32_SAVE.
//
// Returns false, leaving \p MI in place, when \p OnlyToVGPR is set and either
// no VGPR lanes are assigned to \p Index or SMEM spilling is enabled.
// Otherwise \p MI is erased, the spilled piece count is recorded with
// addToSpilledSGPRs and true is returned.
//
// NOTE(review): \p RS is dereferenced without a null check on the SMEM path.
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS,
                               bool OnlyToVGPR) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  // NOTE(review): this set is local to the call, so it only remembers VGPRs
  // written earlier within this same spill.
  DenseSet<unsigned> SGPRSpillVGPRDefinedSet;

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
    = MFI->getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  MachineRegisterInfo &MRI = MF->getRegInfo();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  Register SuperReg = MI->getOperand(0).getReg();
  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();

  bool SpillToSMEM = spillSGPRToSMEM();
  if (SpillToSMEM && OnlyToVGPR)
    return false;

  Register FrameReg = getFrameRegister(*MF);

  // The stack pointer, frame offset and scratch wave offset registers may only
  // be spilled to VGPR lanes.
  assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
                         SuperReg != MFI->getFrameOffsetReg() &&
                         SuperReg != MFI->getScratchWaveOffsetReg()));

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    // m0 is about to be overwritten with the store offset; save its current
    // value if it is in use so it can be restored after the loop.
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  // NOTE(review): ScalarStoreOp is only assigned when SpillToSMEM is set and
  // RC is an SGPR class, but it is read on every SMEM-path iteration below.
  unsigned ScalarStoreOp;
  unsigned EltSize = 4;
  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  if (SpillToSMEM && isSGPRClass(RC)) {
    // XXX - if private_element_size is larger than 4 it might be useful to be
    // able to spill wider vmem spills.
    std::tie(EltSize, ScalarStoreOp) =
          getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
  }

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToSMEM) {
      int64_t FrOffset = FrameInfo.getObjectOffset(Index);

      // The allocated memory size is really the wavefront size * the frame
      // index size. The widest register class is 64 bytes, so a 4-byte scratch
      // allocation is enough to spill this in a single stack object.
      //
      // FIXME: Frame size/offsets are computed earlier than this, so the extra
      // space is still unnecessarily allocated.

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));

      // SMEM instructions only support a single offset, so increment the wave
      // offset.

      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(FrameReg)
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(FrameReg);
      }

      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
        .addReg(SubReg, getKillRegState(IsKill)) // sdata
        .addReg(MFI->getScratchRSrcReg())        // sbase
        .addReg(OffsetReg, RegState::Kill)       // soff
        .addImm(0)                               // glc
        .addImm(0)                               // dlc
        .addMemOperand(MMO);

      continue;
    }

    if (SpillToVGPR) {
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];

      // During SGPR spilling to VGPR, determine if the VGPR is defined. The
      // only circumstance in which we say it is undefined is when it is the
      // first spill to this VGPR in the first basic block.
      bool VGPRDefined = true;
      if (MBB == &MF->front())
        VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;

      // Mark the "old value of vgpr" input undef only if this is the first sgpr
      // spill to this specific vgpr in the first basic block.
      BuildMI(*MBB, MI, DL,
              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
              Spill.VGPR)
        .addReg(SubReg, getKillRegState(IsKill))
        .addImm(Spill.Lane)
        .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    } else {
      // XXX - Can a to-VGPR spill fail for some subregisters but not others?
      if (OnlyToVGPR)
        return false;

      // Spill SGPR to a frame index.
      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      MachineInstrBuilder Mov
        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
        .addReg(SubReg, SubKillState);


      // There could be undef components of a spilled super register.
      // TODO: Can we detect this and skip the spill?
      if (NumSubRegs > 1) {
        // The last implicit use of the SuperReg carries the "Kill" flag.
        unsigned SuperKillState = 0;
        if (i + 1 == e)
          SuperKillState |= getKillRegState(IsKill);
        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
      }

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
        .addReg(TmpReg, RegState::Kill)       // src
        .addFrameIndex(Index)                 // vaddr
        .addReg(MFI->getScratchRSrcReg())     // srrsrc
        .addReg(MFI->getStackPtrOffsetReg())  // soffset
        .addImm(i * 4)                        // offset
        .addMemOperand(MMO);
    }
  }

  // Restore m0 if it was saved before the SMEM stores.
  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  MFI->addToSpilledSGPRs(NumSubRegs);
  return true;
}

bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS,
                                 bool OnlyToVGPR) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
    = MFI->getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  MachineFrameInfo &FrameInfo =
MF->getFrameInfo(); 987 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 988 const SIInstrInfo *TII = ST.getInstrInfo(); 989 const DebugLoc &DL = MI->getDebugLoc(); 990 991 Register SuperReg = MI->getOperand(0).getReg(); 992 bool SpillToSMEM = spillSGPRToSMEM(); 993 if (SpillToSMEM && OnlyToVGPR) 994 return false; 995 996 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 997 998 unsigned OffsetReg = AMDGPU::M0; 999 unsigned M0CopyReg = AMDGPU::NoRegister; 1000 1001 if (SpillToSMEM) { 1002 if (RS->isRegUsed(AMDGPU::M0)) { 1003 M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 1004 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) 1005 .addReg(AMDGPU::M0); 1006 } 1007 } 1008 1009 unsigned EltSize = 4; 1010 unsigned ScalarLoadOp; 1011 1012 Register FrameReg = getFrameRegister(*MF); 1013 1014 const TargetRegisterClass *RC = getPhysRegClass(SuperReg); 1015 if (SpillToSMEM && isSGPRClass(RC)) { 1016 // XXX - if private_element_size is larger than 4 it might be useful to be 1017 // able to spill wider vmem spills. 1018 std::tie(EltSize, ScalarLoadOp) = 1019 getSpillEltSize(getRegSizeInBits(*RC) / 8, false); 1020 } 1021 1022 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); 1023 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 1024 1025 // SubReg carries the "Kill" flag when SubReg == SuperReg. 1026 int64_t FrOffset = FrameInfo.getObjectOffset(Index); 1027 1028 for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { 1029 unsigned SubReg = NumSubRegs == 1 ? 1030 SuperReg : getSubReg(SuperReg, SplitParts[i]); 1031 1032 if (SpillToSMEM) { 1033 // FIXME: Size may be > 4 but extra bytes wasted. 
1034 unsigned Align = FrameInfo.getObjectAlignment(Index); 1035 MachinePointerInfo PtrInfo 1036 = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); 1037 MachineMemOperand *MMO 1038 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 1039 EltSize, MinAlign(Align, EltSize * i)); 1040 1041 // Add i * 4 offset 1042 int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); 1043 if (Offset != 0) { 1044 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) 1045 .addReg(FrameReg) 1046 .addImm(Offset); 1047 } else { 1048 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 1049 .addReg(FrameReg); 1050 } 1051 1052 auto MIB = 1053 BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg) 1054 .addReg(MFI->getScratchRSrcReg()) // sbase 1055 .addReg(OffsetReg, RegState::Kill) // soff 1056 .addImm(0) // glc 1057 .addImm(0) // dlc 1058 .addMemOperand(MMO); 1059 1060 if (NumSubRegs > 1 && i == 0) 1061 MIB.addReg(SuperReg, RegState::ImplicitDefine); 1062 1063 continue; 1064 } 1065 1066 if (SpillToVGPR) { 1067 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 1068 auto MIB = 1069 BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), 1070 SubReg) 1071 .addReg(Spill.VGPR) 1072 .addImm(Spill.Lane); 1073 1074 if (NumSubRegs > 1 && i == 0) 1075 MIB.addReg(SuperReg, RegState::ImplicitDefine); 1076 } else { 1077 if (OnlyToVGPR) 1078 return false; 1079 1080 // Restore SGPR from a stack slot. 1081 // FIXME: We should use S_LOAD_DWORD here for VI. 
1082 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1083 unsigned Align = FrameInfo.getObjectAlignment(Index); 1084 1085 MachinePointerInfo PtrInfo 1086 = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); 1087 1088 MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, 1089 MachineMemOperand::MOLoad, EltSize, 1090 MinAlign(Align, EltSize * i)); 1091 1092 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) 1093 .addFrameIndex(Index) // vaddr 1094 .addReg(MFI->getScratchRSrcReg()) // srsrc 1095 .addReg(MFI->getStackPtrOffsetReg()) // soffset 1096 .addImm(i * 4) // offset 1097 .addMemOperand(MMO); 1098 1099 auto MIB = 1100 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) 1101 .addReg(TmpReg, RegState::Kill); 1102 1103 if (NumSubRegs > 1) 1104 MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); 1105 } 1106 } 1107 1108 if (M0CopyReg != AMDGPU::NoRegister) { 1109 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) 1110 .addReg(M0CopyReg, RegState::Kill); 1111 } 1112 1113 MI->eraseFromParent(); 1114 return true; 1115 } 1116 1117 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 1118 /// a VGPR and the stack slot can be safely eliminated when all other users are 1119 /// handled. 
1120 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 1121 MachineBasicBlock::iterator MI, 1122 int FI, 1123 RegScavenger *RS) const { 1124 switch (MI->getOpcode()) { 1125 case AMDGPU::SI_SPILL_S1024_SAVE: 1126 case AMDGPU::SI_SPILL_S512_SAVE: 1127 case AMDGPU::SI_SPILL_S256_SAVE: 1128 case AMDGPU::SI_SPILL_S160_SAVE: 1129 case AMDGPU::SI_SPILL_S128_SAVE: 1130 case AMDGPU::SI_SPILL_S96_SAVE: 1131 case AMDGPU::SI_SPILL_S64_SAVE: 1132 case AMDGPU::SI_SPILL_S32_SAVE: 1133 return spillSGPR(MI, FI, RS, true); 1134 case AMDGPU::SI_SPILL_S1024_RESTORE: 1135 case AMDGPU::SI_SPILL_S512_RESTORE: 1136 case AMDGPU::SI_SPILL_S256_RESTORE: 1137 case AMDGPU::SI_SPILL_S160_RESTORE: 1138 case AMDGPU::SI_SPILL_S128_RESTORE: 1139 case AMDGPU::SI_SPILL_S96_RESTORE: 1140 case AMDGPU::SI_SPILL_S64_RESTORE: 1141 case AMDGPU::SI_SPILL_S32_RESTORE: 1142 return restoreSGPR(MI, FI, RS, true); 1143 default: 1144 llvm_unreachable("not an SGPR spill instruction"); 1145 } 1146 } 1147 1148 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 1149 int SPAdj, unsigned FIOperandNum, 1150 RegScavenger *RS) const { 1151 MachineFunction *MF = MI->getParent()->getParent(); 1152 MachineRegisterInfo &MRI = MF->getRegInfo(); 1153 MachineBasicBlock *MBB = MI->getParent(); 1154 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1155 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 1156 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 1157 const SIInstrInfo *TII = ST.getInstrInfo(); 1158 DebugLoc DL = MI->getDebugLoc(); 1159 1160 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 1161 1162 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 1163 int Index = MI->getOperand(FIOperandNum).getIndex(); 1164 1165 Register FrameReg = getFrameRegister(*MF); 1166 1167 switch (MI->getOpcode()) { 1168 // SGPR register spill 1169 case AMDGPU::SI_SPILL_S1024_SAVE: 1170 case AMDGPU::SI_SPILL_S512_SAVE: 1171 case AMDGPU::SI_SPILL_S256_SAVE: 1172 case 
AMDGPU::SI_SPILL_S160_SAVE: 1173 case AMDGPU::SI_SPILL_S128_SAVE: 1174 case AMDGPU::SI_SPILL_S96_SAVE: 1175 case AMDGPU::SI_SPILL_S64_SAVE: 1176 case AMDGPU::SI_SPILL_S32_SAVE: { 1177 spillSGPR(MI, Index, RS); 1178 break; 1179 } 1180 1181 // SGPR register restore 1182 case AMDGPU::SI_SPILL_S1024_RESTORE: 1183 case AMDGPU::SI_SPILL_S512_RESTORE: 1184 case AMDGPU::SI_SPILL_S256_RESTORE: 1185 case AMDGPU::SI_SPILL_S160_RESTORE: 1186 case AMDGPU::SI_SPILL_S128_RESTORE: 1187 case AMDGPU::SI_SPILL_S96_RESTORE: 1188 case AMDGPU::SI_SPILL_S64_RESTORE: 1189 case AMDGPU::SI_SPILL_S32_RESTORE: { 1190 restoreSGPR(MI, Index, RS); 1191 break; 1192 } 1193 1194 // VGPR register spill 1195 case AMDGPU::SI_SPILL_V1024_SAVE: 1196 case AMDGPU::SI_SPILL_V512_SAVE: 1197 case AMDGPU::SI_SPILL_V256_SAVE: 1198 case AMDGPU::SI_SPILL_V160_SAVE: 1199 case AMDGPU::SI_SPILL_V128_SAVE: 1200 case AMDGPU::SI_SPILL_V96_SAVE: 1201 case AMDGPU::SI_SPILL_V64_SAVE: 1202 case AMDGPU::SI_SPILL_V32_SAVE: 1203 case AMDGPU::SI_SPILL_A1024_SAVE: 1204 case AMDGPU::SI_SPILL_A512_SAVE: 1205 case AMDGPU::SI_SPILL_A128_SAVE: 1206 case AMDGPU::SI_SPILL_A64_SAVE: 1207 case AMDGPU::SI_SPILL_A32_SAVE: { 1208 const MachineOperand *VData = TII->getNamedOperand(*MI, 1209 AMDGPU::OpName::vdata); 1210 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1211 MFI->getStackPtrOffsetReg()); 1212 1213 buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, 1214 Index, 1215 VData->getReg(), VData->isKill(), 1216 TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), 1217 FrameReg, 1218 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1219 *MI->memoperands_begin(), 1220 RS); 1221 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 1222 MI->eraseFromParent(); 1223 break; 1224 } 1225 case AMDGPU::SI_SPILL_V32_RESTORE: 1226 case AMDGPU::SI_SPILL_V64_RESTORE: 1227 case AMDGPU::SI_SPILL_V96_RESTORE: 1228 case AMDGPU::SI_SPILL_V128_RESTORE: 1229 case AMDGPU::SI_SPILL_V160_RESTORE: 
1230 case AMDGPU::SI_SPILL_V256_RESTORE: 1231 case AMDGPU::SI_SPILL_V512_RESTORE: 1232 case AMDGPU::SI_SPILL_V1024_RESTORE: 1233 case AMDGPU::SI_SPILL_A32_RESTORE: 1234 case AMDGPU::SI_SPILL_A64_RESTORE: 1235 case AMDGPU::SI_SPILL_A128_RESTORE: 1236 case AMDGPU::SI_SPILL_A512_RESTORE: 1237 case AMDGPU::SI_SPILL_A1024_RESTORE: { 1238 const MachineOperand *VData = TII->getNamedOperand(*MI, 1239 AMDGPU::OpName::vdata); 1240 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1241 MFI->getStackPtrOffsetReg()); 1242 1243 buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, 1244 Index, 1245 VData->getReg(), VData->isKill(), 1246 TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), 1247 FrameReg, 1248 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1249 *MI->memoperands_begin(), 1250 RS); 1251 MI->eraseFromParent(); 1252 break; 1253 } 1254 1255 default: { 1256 const DebugLoc &DL = MI->getDebugLoc(); 1257 bool IsMUBUF = TII->isMUBUF(*MI); 1258 1259 if (!IsMUBUF && !MFI->isEntryFunction()) { 1260 // Convert to an absolute stack address by finding the offset from the 1261 // scratch wave base and scaling by the wave size. 1262 // 1263 // In an entry function/kernel the offset is already the absolute 1264 // address relative to the frame register. 1265 1266 unsigned DiffReg 1267 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 1268 1269 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; 1270 Register ResultReg = IsCopy ? 1271 MI->getOperand(0).getReg() : 1272 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1273 1274 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) 1275 .addReg(FrameReg) 1276 .addReg(MFI->getScratchWaveOffsetReg()); 1277 1278 int64_t Offset = FrameInfo.getObjectOffset(Index); 1279 if (Offset == 0) { 1280 // XXX - This never happens because of emergency scavenging slot at 0? 
1281 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) 1282 .addImm(Log2_32(ST.getWavefrontSize())) 1283 .addReg(DiffReg); 1284 } else { 1285 unsigned ScaledReg 1286 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1287 1288 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) 1289 .addImm(Log2_32(ST.getWavefrontSize())) 1290 .addReg(DiffReg, RegState::Kill); 1291 1292 // TODO: Fold if use instruction is another add of a constant. 1293 if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 1294 TII->getAddNoCarry(*MBB, MI, DL, ResultReg) 1295 .addImm(Offset) 1296 .addReg(ScaledReg, RegState::Kill) 1297 .addImm(0); // clamp bit 1298 } else { 1299 unsigned ConstOffsetReg 1300 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 1301 1302 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 1303 .addImm(Offset); 1304 TII->getAddNoCarry(*MBB, MI, DL, ResultReg) 1305 .addReg(ConstOffsetReg, RegState::Kill) 1306 .addReg(ScaledReg, RegState::Kill) 1307 .addImm(0); // clamp bit 1308 } 1309 } 1310 1311 // Don't introduce an extra copy if we're just materializing in a mov. 1312 if (IsCopy) 1313 MI->eraseFromParent(); 1314 else 1315 FIOp.ChangeToRegister(ResultReg, false, false, true); 1316 return; 1317 } 1318 1319 if (IsMUBUF) { 1320 // Disable offen so we don't need a 0 vgpr base. 
1321 assert(static_cast<int>(FIOperandNum) == 1322 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 1323 AMDGPU::OpName::vaddr)); 1324 1325 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1326 MFI->getStackPtrOffsetReg()); 1327 1328 TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg); 1329 1330 int64_t Offset = FrameInfo.getObjectOffset(Index); 1331 int64_t OldImm 1332 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 1333 int64_t NewOffset = OldImm + Offset; 1334 1335 if (isUInt<12>(NewOffset) && 1336 buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) { 1337 MI->eraseFromParent(); 1338 return; 1339 } 1340 } 1341 1342 // If the offset is simply too big, don't convert to a scratch wave offset 1343 // relative index. 1344 1345 int64_t Offset = FrameInfo.getObjectOffset(Index); 1346 FIOp.ChangeToImmediate(Offset); 1347 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 1348 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1349 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 1350 .addImm(Offset); 1351 FIOp.ChangeToRegister(TmpReg, false, false, true); 1352 } 1353 } 1354 } 1355 } 1356 1357 StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { 1358 return AMDGPUInstPrinter::getRegisterName(Reg); 1359 } 1360 1361 // FIXME: This is very slow. It might be worth creating a map from physreg to 1362 // register class. 
1363 const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { 1364 assert(!Register::isVirtualRegister(Reg)); 1365 1366 static const TargetRegisterClass *const BaseClasses[] = { 1367 &AMDGPU::VGPR_32RegClass, 1368 &AMDGPU::SReg_32RegClass, 1369 &AMDGPU::AGPR_32RegClass, 1370 &AMDGPU::VReg_64RegClass, 1371 &AMDGPU::SReg_64RegClass, 1372 &AMDGPU::AReg_64RegClass, 1373 &AMDGPU::VReg_96RegClass, 1374 &AMDGPU::SReg_96RegClass, 1375 &AMDGPU::VReg_128RegClass, 1376 &AMDGPU::SReg_128RegClass, 1377 &AMDGPU::AReg_128RegClass, 1378 &AMDGPU::VReg_160RegClass, 1379 &AMDGPU::SReg_160RegClass, 1380 &AMDGPU::VReg_256RegClass, 1381 &AMDGPU::SReg_256RegClass, 1382 &AMDGPU::VReg_512RegClass, 1383 &AMDGPU::SReg_512RegClass, 1384 &AMDGPU::AReg_512RegClass, 1385 &AMDGPU::SReg_1024RegClass, 1386 &AMDGPU::VReg_1024RegClass, 1387 &AMDGPU::AReg_1024RegClass, 1388 &AMDGPU::SCC_CLASSRegClass, 1389 &AMDGPU::Pseudo_SReg_32RegClass, 1390 &AMDGPU::Pseudo_SReg_128RegClass, 1391 }; 1392 1393 for (const TargetRegisterClass *BaseClass : BaseClasses) { 1394 if (BaseClass->contains(Reg)) { 1395 return BaseClass; 1396 } 1397 } 1398 return nullptr; 1399 } 1400 1401 // TODO: It might be helpful to have some target specific flags in 1402 // TargetRegisterClass to mark which classes are VGPRs to make this trivial. 
1403 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { 1404 unsigned Size = getRegSizeInBits(*RC); 1405 if (Size < 32) 1406 return false; 1407 switch (Size) { 1408 case 32: 1409 return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; 1410 case 64: 1411 return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; 1412 case 96: 1413 return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; 1414 case 128: 1415 return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; 1416 case 160: 1417 return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr; 1418 case 256: 1419 return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; 1420 case 512: 1421 return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; 1422 case 1024: 1423 return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; 1424 default: 1425 llvm_unreachable("Invalid register class size"); 1426 } 1427 } 1428 1429 bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const { 1430 unsigned Size = getRegSizeInBits(*RC); 1431 if (Size < 32) 1432 return false; 1433 switch (Size) { 1434 case 32: 1435 return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr; 1436 case 64: 1437 return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr; 1438 case 96: 1439 return false; 1440 case 128: 1441 return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr; 1442 case 160: 1443 case 256: 1444 return false; 1445 case 512: 1446 return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr; 1447 case 1024: 1448 return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr; 1449 default: 1450 llvm_unreachable("Invalid register class size"); 1451 } 1452 } 1453 1454 const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( 1455 const TargetRegisterClass *SRC) const { 1456 switch (getRegSizeInBits(*SRC)) { 1457 case 32: 1458 return &AMDGPU::VGPR_32RegClass; 1459 case 64: 1460 return 
&AMDGPU::VReg_64RegClass; 1461 case 96: 1462 return &AMDGPU::VReg_96RegClass; 1463 case 128: 1464 return &AMDGPU::VReg_128RegClass; 1465 case 160: 1466 return &AMDGPU::VReg_160RegClass; 1467 case 256: 1468 return &AMDGPU::VReg_256RegClass; 1469 case 512: 1470 return &AMDGPU::VReg_512RegClass; 1471 case 1024: 1472 return &AMDGPU::VReg_1024RegClass; 1473 default: 1474 llvm_unreachable("Invalid register class size"); 1475 } 1476 } 1477 1478 const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass( 1479 const TargetRegisterClass *SRC) const { 1480 switch (getRegSizeInBits(*SRC)) { 1481 case 32: 1482 return &AMDGPU::AGPR_32RegClass; 1483 case 64: 1484 return &AMDGPU::AReg_64RegClass; 1485 case 128: 1486 return &AMDGPU::AReg_128RegClass; 1487 case 512: 1488 return &AMDGPU::AReg_512RegClass; 1489 case 1024: 1490 return &AMDGPU::AReg_1024RegClass; 1491 default: 1492 llvm_unreachable("Invalid register class size"); 1493 } 1494 } 1495 1496 const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( 1497 const TargetRegisterClass *VRC) const { 1498 switch (getRegSizeInBits(*VRC)) { 1499 case 32: 1500 return &AMDGPU::SGPR_32RegClass; 1501 case 64: 1502 return &AMDGPU::SReg_64RegClass; 1503 case 96: 1504 return &AMDGPU::SReg_96RegClass; 1505 case 128: 1506 return &AMDGPU::SReg_128RegClass; 1507 case 160: 1508 return &AMDGPU::SReg_160RegClass; 1509 case 256: 1510 return &AMDGPU::SReg_256RegClass; 1511 case 512: 1512 return &AMDGPU::SReg_512RegClass; 1513 case 1024: 1514 return &AMDGPU::SReg_1024RegClass; 1515 default: 1516 llvm_unreachable("Invalid register class size"); 1517 } 1518 } 1519 1520 const TargetRegisterClass *SIRegisterInfo::getSubRegClass( 1521 const TargetRegisterClass *RC, unsigned SubIdx) const { 1522 if (SubIdx == AMDGPU::NoSubRegister) 1523 return RC; 1524 1525 // We can assume that each lane corresponds to one 32-bit register. 
1526 unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes(); 1527 if (isSGPRClass(RC)) { 1528 switch (Count) { 1529 case 1: 1530 return &AMDGPU::SGPR_32RegClass; 1531 case 2: 1532 return &AMDGPU::SReg_64RegClass; 1533 case 3: 1534 return &AMDGPU::SReg_96RegClass; 1535 case 4: 1536 return &AMDGPU::SReg_128RegClass; 1537 case 5: 1538 return &AMDGPU::SReg_160RegClass; 1539 case 8: 1540 return &AMDGPU::SReg_256RegClass; 1541 case 16: 1542 return &AMDGPU::SReg_512RegClass; 1543 case 32: /* fall-through */ 1544 default: 1545 llvm_unreachable("Invalid sub-register class size"); 1546 } 1547 } else if (hasAGPRs(RC)) { 1548 switch (Count) { 1549 case 1: 1550 return &AMDGPU::AGPR_32RegClass; 1551 case 2: 1552 return &AMDGPU::AReg_64RegClass; 1553 case 4: 1554 return &AMDGPU::AReg_128RegClass; 1555 case 16: 1556 return &AMDGPU::AReg_512RegClass; 1557 case 32: /* fall-through */ 1558 default: 1559 llvm_unreachable("Invalid sub-register class size"); 1560 } 1561 } else { 1562 switch (Count) { 1563 case 1: 1564 return &AMDGPU::VGPR_32RegClass; 1565 case 2: 1566 return &AMDGPU::VReg_64RegClass; 1567 case 3: 1568 return &AMDGPU::VReg_96RegClass; 1569 case 4: 1570 return &AMDGPU::VReg_128RegClass; 1571 case 5: 1572 return &AMDGPU::VReg_160RegClass; 1573 case 8: 1574 return &AMDGPU::VReg_256RegClass; 1575 case 16: 1576 return &AMDGPU::VReg_512RegClass; 1577 case 32: /* fall-through */ 1578 default: 1579 llvm_unreachable("Invalid sub-register class size"); 1580 } 1581 } 1582 } 1583 1584 bool SIRegisterInfo::shouldRewriteCopySrc( 1585 const TargetRegisterClass *DefRC, 1586 unsigned DefSubReg, 1587 const TargetRegisterClass *SrcRC, 1588 unsigned SrcSubReg) const { 1589 // We want to prefer the smallest register class possible, so we don't want to 1590 // stop and rewrite on anything that looks like a subregister 1591 // extract. 
Operations mostly don't care about the super register class, so we 1592 // only want to stop on the most basic of copies between the same register 1593 // class. 1594 // 1595 // e.g. if we have something like 1596 // %0 = ... 1597 // %1 = ... 1598 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 1599 // %3 = COPY %2, sub0 1600 // 1601 // We want to look through the COPY to find: 1602 // => %3 = COPY %0 1603 1604 // Plain copy. 1605 return getCommonSubClass(DefRC, SrcRC) != nullptr; 1606 } 1607 1608 /// Returns a register that is not used at any point in the function. 1609 /// If all registers are used, then this function will return 1610 // AMDGPU::NoRegister. 1611 unsigned 1612 SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, 1613 const TargetRegisterClass *RC, 1614 const MachineFunction &MF) const { 1615 1616 for (unsigned Reg : *RC) 1617 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 1618 return Reg; 1619 return AMDGPU::NoRegister; 1620 } 1621 1622 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, 1623 unsigned EltSize) const { 1624 if (EltSize == 4) { 1625 static const int16_t Sub0_31[] = { 1626 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1627 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1628 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 1629 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 1630 AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, 1631 AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, 1632 AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, 1633 AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31, 1634 }; 1635 1636 static const int16_t Sub0_15[] = { 1637 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1638 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1639 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 1640 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 1641 }; 1642 
1643 static const int16_t Sub0_7[] = { 1644 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1645 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1646 }; 1647 1648 static const int16_t Sub0_4[] = { 1649 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, 1650 }; 1651 1652 static const int16_t Sub0_3[] = { 1653 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1654 }; 1655 1656 static const int16_t Sub0_2[] = { 1657 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 1658 }; 1659 1660 static const int16_t Sub0_1[] = { 1661 AMDGPU::sub0, AMDGPU::sub1, 1662 }; 1663 1664 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1665 case 32: 1666 return {}; 1667 case 64: 1668 return makeArrayRef(Sub0_1); 1669 case 96: 1670 return makeArrayRef(Sub0_2); 1671 case 128: 1672 return makeArrayRef(Sub0_3); 1673 case 160: 1674 return makeArrayRef(Sub0_4); 1675 case 256: 1676 return makeArrayRef(Sub0_7); 1677 case 512: 1678 return makeArrayRef(Sub0_15); 1679 case 1024: 1680 return makeArrayRef(Sub0_31); 1681 default: 1682 llvm_unreachable("unhandled register size"); 1683 } 1684 } 1685 1686 if (EltSize == 8) { 1687 static const int16_t Sub0_31_64[] = { 1688 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1689 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 1690 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 1691 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 1692 AMDGPU::sub16_sub17, AMDGPU::sub18_sub19, 1693 AMDGPU::sub20_sub21, AMDGPU::sub22_sub23, 1694 AMDGPU::sub24_sub25, AMDGPU::sub26_sub27, 1695 AMDGPU::sub28_sub29, AMDGPU::sub30_sub31 1696 }; 1697 1698 static const int16_t Sub0_15_64[] = { 1699 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1700 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 1701 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 1702 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15 1703 }; 1704 1705 static const int16_t Sub0_7_64[] = { 1706 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1707 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7 1708 }; 1709 1710 1711 static const int16_t Sub0_3_64[] = { 1712 AMDGPU::sub0_sub1, 
AMDGPU::sub2_sub3 1713 }; 1714 1715 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1716 case 64: 1717 return {}; 1718 case 128: 1719 return makeArrayRef(Sub0_3_64); 1720 case 256: 1721 return makeArrayRef(Sub0_7_64); 1722 case 512: 1723 return makeArrayRef(Sub0_15_64); 1724 case 1024: 1725 return makeArrayRef(Sub0_31_64); 1726 default: 1727 llvm_unreachable("unhandled register size"); 1728 } 1729 } 1730 1731 if (EltSize == 16) { 1732 1733 static const int16_t Sub0_31_128[] = { 1734 AMDGPU::sub0_sub1_sub2_sub3, 1735 AMDGPU::sub4_sub5_sub6_sub7, 1736 AMDGPU::sub8_sub9_sub10_sub11, 1737 AMDGPU::sub12_sub13_sub14_sub15, 1738 AMDGPU::sub16_sub17_sub18_sub19, 1739 AMDGPU::sub20_sub21_sub22_sub23, 1740 AMDGPU::sub24_sub25_sub26_sub27, 1741 AMDGPU::sub28_sub29_sub30_sub31 1742 }; 1743 1744 static const int16_t Sub0_15_128[] = { 1745 AMDGPU::sub0_sub1_sub2_sub3, 1746 AMDGPU::sub4_sub5_sub6_sub7, 1747 AMDGPU::sub8_sub9_sub10_sub11, 1748 AMDGPU::sub12_sub13_sub14_sub15 1749 }; 1750 1751 static const int16_t Sub0_7_128[] = { 1752 AMDGPU::sub0_sub1_sub2_sub3, 1753 AMDGPU::sub4_sub5_sub6_sub7 1754 }; 1755 1756 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1757 case 128: 1758 return {}; 1759 case 256: 1760 return makeArrayRef(Sub0_7_128); 1761 case 512: 1762 return makeArrayRef(Sub0_15_128); 1763 case 1024: 1764 return makeArrayRef(Sub0_31_128); 1765 default: 1766 llvm_unreachable("unhandled register size"); 1767 } 1768 } 1769 1770 assert(EltSize == 32 && "unhandled elt size"); 1771 1772 static const int16_t Sub0_31_256[] = { 1773 AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, 1774 AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, 1775 AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23, 1776 AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 1777 }; 1778 1779 static const int16_t Sub0_15_256[] = { 1780 AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, 1781 AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 1782 }; 1783 1784 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 
1785 case 256: 1786 return {}; 1787 case 512: 1788 return makeArrayRef(Sub0_15_256); 1789 case 1024: 1790 return makeArrayRef(Sub0_31_256); 1791 default: 1792 llvm_unreachable("unhandled register size"); 1793 } 1794 } 1795 1796 const TargetRegisterClass* 1797 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, 1798 unsigned Reg) const { 1799 if (Register::isVirtualRegister(Reg)) 1800 return MRI.getRegClass(Reg); 1801 1802 return getPhysRegClass(Reg); 1803 } 1804 1805 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, 1806 unsigned Reg) const { 1807 const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); 1808 assert(RC && "Register class for the reg not found"); 1809 return hasVGPRs(RC); 1810 } 1811 1812 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, 1813 unsigned Reg) const { 1814 const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); 1815 assert(RC && "Register class for the reg not found"); 1816 return hasAGPRs(RC); 1817 } 1818 1819 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, 1820 const TargetRegisterClass *SrcRC, 1821 unsigned SubReg, 1822 const TargetRegisterClass *DstRC, 1823 unsigned DstSubReg, 1824 const TargetRegisterClass *NewRC, 1825 LiveIntervals &LIS) const { 1826 unsigned SrcSize = getRegSizeInBits(*SrcRC); 1827 unsigned DstSize = getRegSizeInBits(*DstRC); 1828 unsigned NewSize = getRegSizeInBits(*NewRC); 1829 1830 // Do not increase size of registers beyond dword, we would need to allocate 1831 // adjacent registers and constraint regalloc more than needed. 1832 1833 // Always allow dword coalescing. 
1834 if (SrcSize <= 32 || DstSize <= 32) 1835 return true; 1836 1837 return NewSize <= DstSize || NewSize <= SrcSize; 1838 } 1839 1840 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 1841 MachineFunction &MF) const { 1842 1843 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1844 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1845 1846 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), 1847 MF.getFunction()); 1848 switch (RC->getID()) { 1849 default: 1850 return AMDGPURegisterInfo::getRegPressureLimit(RC, MF); 1851 case AMDGPU::VGPR_32RegClassID: 1852 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); 1853 case AMDGPU::SGPR_32RegClassID: 1854 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); 1855 } 1856 } 1857 1858 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 1859 unsigned Idx) const { 1860 if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet()) 1861 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 1862 const_cast<MachineFunction &>(MF)); 1863 1864 if (Idx == getSGPRPressureSet()) 1865 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 1866 const_cast<MachineFunction &>(MF)); 1867 1868 return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx); 1869 } 1870 1871 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 1872 static const int Empty[] = { -1 }; 1873 1874 if (hasRegUnit(AMDGPU::M0, RegUnit)) 1875 return Empty; 1876 return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit); 1877 } 1878 1879 unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 1880 // Not a callee saved register. 
1881 return AMDGPU::SGPR30_SGPR31; 1882 } 1883 1884 const TargetRegisterClass * 1885 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 1886 const RegisterBank &RB, 1887 const MachineRegisterInfo &MRI) const { 1888 switch (Size) { 1889 case 1: { 1890 switch (RB.getID()) { 1891 case AMDGPU::VGPRRegBankID: 1892 return &AMDGPU::VGPR_32RegClass; 1893 case AMDGPU::VCCRegBankID: 1894 return isWave32 ? 1895 &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass; 1896 case AMDGPU::SGPRRegBankID: 1897 return &AMDGPU::SReg_32_XM0RegClass; 1898 case AMDGPU::SCCRegBankID: 1899 // This needs to return an allocatable class, so don't bother returning 1900 // the dummy SCC class. 1901 return &AMDGPU::SReg_32_XM0RegClass; 1902 default: 1903 llvm_unreachable("unknown register bank"); 1904 } 1905 } 1906 case 32: 1907 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : 1908 &AMDGPU::SReg_32_XM0RegClass; 1909 case 64: 1910 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : 1911 &AMDGPU::SReg_64_XEXECRegClass; 1912 case 96: 1913 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : 1914 &AMDGPU::SReg_96RegClass; 1915 case 128: 1916 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : 1917 &AMDGPU::SReg_128RegClass; 1918 case 160: 1919 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass : 1920 &AMDGPU::SReg_160RegClass; 1921 case 256: 1922 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass : 1923 &AMDGPU::SReg_256RegClass; 1924 case 512: 1925 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass : 1926 &AMDGPU::SReg_512RegClass; 1927 default: 1928 if (Size < 32) 1929 return RB.getID() == AMDGPU::VGPRRegBankID ? 
&AMDGPU::VGPR_32RegClass : 1930 &AMDGPU::SReg_32_XM0RegClass; 1931 return nullptr; 1932 } 1933 } 1934 1935 const TargetRegisterClass * 1936 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 1937 const MachineRegisterInfo &MRI) const { 1938 if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg())) 1939 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); 1940 return nullptr; 1941 } 1942 1943 unsigned SIRegisterInfo::getVCC() const { 1944 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 1945 } 1946 1947 const TargetRegisterClass * 1948 SIRegisterInfo::getRegClass(unsigned RCID) const { 1949 switch ((int)RCID) { 1950 case AMDGPU::SReg_1RegClassID: 1951 return getBoolRC(); 1952 case AMDGPU::SReg_1_XEXECRegClassID: 1953 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 1954 : &AMDGPU::SReg_64_XEXECRegClass; 1955 case -1: 1956 return nullptr; 1957 default: 1958 return AMDGPURegisterInfo::getRegClass(RCID); 1959 } 1960 } 1961 1962 // Find reaching register definition 1963 MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg, 1964 MachineInstr &Use, 1965 MachineRegisterInfo &MRI, 1966 LiveIntervals *LIS) const { 1967 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 1968 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 1969 SlotIndex DefIdx; 1970 1971 if (Register::isVirtualRegister(Reg)) { 1972 if (!LIS->hasInterval(Reg)) 1973 return nullptr; 1974 LiveInterval &LI = LIS->getInterval(Reg); 1975 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 1976 : MRI.getMaxLaneMaskForVReg(Reg); 1977 VNInfo *V = nullptr; 1978 if (LI.hasSubRanges()) { 1979 for (auto &S : LI.subranges()) { 1980 if ((S.LaneMask & SubLanes) == SubLanes) { 1981 V = S.getVNInfoAt(UseIdx); 1982 break; 1983 } 1984 } 1985 } else { 1986 V = LI.getVNInfoAt(UseIdx); 1987 } 1988 if (!V) 1989 return nullptr; 1990 DefIdx = V->def; 1991 } else { 1992 // Find last def. 
1993 for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) { 1994 LiveRange &LR = LIS->getRegUnit(*Units); 1995 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 1996 if (!DefIdx.isValid() || 1997 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 1998 LIS->getInstructionFromIndex(V->def))) 1999 DefIdx = V->def; 2000 } else { 2001 return nullptr; 2002 } 2003 } 2004 } 2005 2006 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 2007 2008 if (!Def || !MDT.dominates(Def, &Use)) 2009 return nullptr; 2010 2011 assert(Def->modifiesRegister(Reg, this)); 2012 2013 return Def; 2014 } 2015