1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI implementation of the TargetRegisterInfo class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIRegisterInfo.h" 15 #include "AMDGPURegisterBankInfo.h" 16 #include "AMDGPUSubtarget.h" 17 #include "SIInstrInfo.h" 18 #include "SIMachineFunctionInfo.h" 19 #include "MCTargetDesc/AMDGPUInstPrinter.h" 20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 21 #include "llvm/CodeGen/LiveIntervals.h" 22 #include "llvm/CodeGen/MachineDominators.h" 23 #include "llvm/CodeGen/MachineFrameInfo.h" 24 #include "llvm/CodeGen/MachineInstrBuilder.h" 25 #include "llvm/CodeGen/RegisterScavenging.h" 26 #include "llvm/CodeGen/SlotIndexes.h" 27 #include "llvm/IR/Function.h" 28 #include "llvm/IR/LLVMContext.h" 29 30 using namespace llvm; 31 32 static bool hasPressureSet(const int *PSets, unsigned PSetID) { 33 for (unsigned i = 0; PSets[i] != -1; ++i) { 34 if (PSets[i] == (int)PSetID) 35 return true; 36 } 37 return false; 38 } 39 40 void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg, 41 BitVector &PressureSets) const { 42 for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) { 43 const int *PSets = getRegUnitPressureSets(*U); 44 if (hasPressureSet(PSets, PSetID)) { 45 PressureSets.set(PSetID); 46 break; 47 } 48 } 49 } 50 51 static cl::opt<bool> EnableSpillSGPRToSMEM( 52 "amdgpu-spill-sgpr-to-smem", 53 cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), 54 cl::init(false)); 55 56 static cl::opt<bool> EnableSpillSGPRToVGPR( 57 "amdgpu-spill-sgpr-to-vgpr", 58 cl::desc("Enable spilling VGPRs to 
SGPRs"), 59 cl::ReallyHidden, 60 cl::init(true)); 61 62 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : 63 AMDGPURegisterInfo(), 64 SGPRPressureSets(getNumRegPressureSets()), 65 VGPRPressureSets(getNumRegPressureSets()), 66 AGPRPressureSets(getNumRegPressureSets()), 67 SpillSGPRToVGPR(false), 68 SpillSGPRToSMEM(false), 69 isWave32(ST.isWave32()) { 70 if (EnableSpillSGPRToSMEM && ST.hasScalarStores()) 71 SpillSGPRToSMEM = true; 72 else if (EnableSpillSGPRToVGPR) 73 SpillSGPRToVGPR = true; 74 75 unsigned NumRegPressureSets = getNumRegPressureSets(); 76 77 SGPRSetID = NumRegPressureSets; 78 VGPRSetID = NumRegPressureSets; 79 AGPRSetID = NumRegPressureSets; 80 81 for (unsigned i = 0; i < NumRegPressureSets; ++i) { 82 classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets); 83 classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets); 84 classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets); 85 } 86 87 // Determine the number of reg units for each pressure set. 88 std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0); 89 for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) { 90 const int *PSets = getRegUnitPressureSets(i); 91 for (unsigned j = 0; PSets[j] != -1; ++j) { 92 ++PressureSetRegUnits[PSets[j]]; 93 } 94 } 95 96 unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0; 97 for (unsigned i = 0; i < NumRegPressureSets; ++i) { 98 if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) { 99 VGPRSetID = i; 100 VGPRMax = PressureSetRegUnits[i]; 101 continue; 102 } 103 if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) { 104 SGPRSetID = i; 105 SGPRMax = PressureSetRegUnits[i]; 106 } 107 if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) { 108 AGPRSetID = i; 109 AGPRMax = PressureSetRegUnits[i]; 110 continue; 111 } 112 } 113 114 assert(SGPRSetID < NumRegPressureSets && 115 VGPRSetID < NumRegPressureSets && 116 AGPRSetID < NumRegPressureSets); 117 } 118 119 unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( 120 const 
MachineFunction &MF) const { 121 122 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 123 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; 124 unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); 125 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); 126 } 127 128 static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { 129 unsigned Reg; 130 131 // Try to place it in a hole after PrivateSegmentBufferReg. 132 if (RegCount & 3) { 133 // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to 134 // alignment constraints, so we have a hole where can put the wave offset. 135 Reg = RegCount - 1; 136 } else { 137 // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the 138 // wave offset before it. 139 Reg = RegCount - 5; 140 } 141 142 return Reg; 143 } 144 145 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( 146 const MachineFunction &MF) const { 147 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 148 unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); 149 return AMDGPU::SGPR_32RegClass.getRegister(Reg); 150 } 151 152 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { 153 BitVector Reserved(getNumRegs()); 154 155 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but 156 // this seems likely to result in bugs, so I'm marking them as reserved. 157 reserveRegisterTuples(Reserved, AMDGPU::EXEC); 158 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); 159 160 // M0 has to be reserved so that llvm accepts it as a live-in into a block. 161 reserveRegisterTuples(Reserved, AMDGPU::M0); 162 163 // Reserve src_vccz, src_execz, src_scc. 164 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); 165 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); 166 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); 167 168 // Reserve the memory aperture registers. 
169 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); 170 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); 171 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); 172 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); 173 174 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. 175 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); 176 177 // Reserve xnack_mask registers - support is not implemented in Codegen. 178 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); 179 180 // Reserve lds_direct register - support is not implemented in Codegen. 181 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); 182 183 // Reserve Trap Handler registers - support is not implemented in Codegen. 184 reserveRegisterTuples(Reserved, AMDGPU::TBA); 185 reserveRegisterTuples(Reserved, AMDGPU::TMA); 186 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); 187 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); 188 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); 189 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); 190 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); 191 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); 192 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); 193 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); 194 195 // Reserve null register - it shall never be allocated 196 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); 197 198 // Disallow vcc_hi allocation in wave32. It may be allocated but most likely 199 // will result in bugs. 
200 if (isWave32) { 201 Reserved.set(AMDGPU::VCC); 202 Reserved.set(AMDGPU::VCC_HI); 203 } 204 205 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 206 207 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 208 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 209 for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { 210 unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); 211 reserveRegisterTuples(Reserved, Reg); 212 } 213 214 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); 215 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 216 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { 217 unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); 218 reserveRegisterTuples(Reserved, Reg); 219 Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 220 reserveRegisterTuples(Reserved, Reg); 221 } 222 223 // Reserve all the rest AGPRs if there are no instructions to use it. 224 if (!ST.hasMAIInsts()) { 225 for (unsigned i = 0; i < MaxNumVGPRs; ++i) { 226 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 227 reserveRegisterTuples(Reserved, Reg); 228 } 229 } 230 231 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 232 233 unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); 234 if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { 235 // Reserve 1 SGPR for scratch wave offset in case we need to spill. 236 reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); 237 } 238 239 unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); 240 if (ScratchRSrcReg != AMDGPU::NoRegister) { 241 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need 242 // to spill. 243 // TODO: May need to reserve a VGPR if doing LDS spilling. 244 reserveRegisterTuples(Reserved, ScratchRSrcReg); 245 assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); 246 } 247 248 // We have to assume the SP is needed in case there are calls in the function, 249 // which is detected after the function is lowered. 
If we aren't really going 250 // to need SP, don't bother reserving it. 251 unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); 252 253 if (StackPtrReg != AMDGPU::NoRegister) { 254 reserveRegisterTuples(Reserved, StackPtrReg); 255 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); 256 } 257 258 unsigned FrameReg = MFI->getFrameOffsetReg(); 259 if (FrameReg != AMDGPU::NoRegister) { 260 reserveRegisterTuples(Reserved, FrameReg); 261 assert(!isSubRegister(ScratchRSrcReg, FrameReg)); 262 } 263 264 for (unsigned Reg : MFI->WWMReservedRegs) { 265 reserveRegisterTuples(Reserved, Reg); 266 } 267 268 // FIXME: Stop using reserved registers for this. 269 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) 270 reserveRegisterTuples(Reserved, Reg); 271 272 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) 273 reserveRegisterTuples(Reserved, Reg); 274 275 return Reserved; 276 } 277 278 bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const { 279 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 280 // On entry, the base address is 0, so it can't possibly need any more 281 // alignment. 282 283 // FIXME: Should be able to specify the entry frame alignment per calling 284 // convention instead. 285 if (Info->isEntryFunction()) 286 return false; 287 288 return TargetRegisterInfo::canRealignStack(MF); 289 } 290 291 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { 292 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); 293 if (Info->isEntryFunction()) { 294 const MachineFrameInfo &MFI = Fn.getFrameInfo(); 295 return MFI.hasStackObjects() || MFI.hasCalls(); 296 } 297 298 // May need scavenger for dealing with callee saved registers. 
299 return true; 300 } 301 302 bool SIRegisterInfo::requiresFrameIndexScavenging( 303 const MachineFunction &MF) const { 304 const MachineFrameInfo &MFI = MF.getFrameInfo(); 305 if (MFI.hasStackObjects()) 306 return true; 307 308 // May need to deal with callee saved registers. 309 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 310 return !Info->isEntryFunction(); 311 } 312 313 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( 314 const MachineFunction &MF) const { 315 const MachineFrameInfo &MFI = MF.getFrameInfo(); 316 if (!MFI.hasStackObjects()) 317 return false; 318 319 // The scavenger is used for large frames which may require finding a free 320 // register for large offsets. 321 if (!isUInt<12>(MFI.getStackSize())) 322 return true; 323 324 // If using scalar stores, for spills, m0 is needed for the scalar store 325 // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual 326 // register for it during frame index elimination, so the scavenger is 327 // directly needed. 328 return MF.getSubtarget<GCNSubtarget>().hasScalarStores() && 329 MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs(); 330 } 331 332 bool SIRegisterInfo::requiresVirtualBaseRegisters( 333 const MachineFunction &) const { 334 // There are no special dedicated stack or frame pointers. 335 return true; 336 } 337 338 bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { 339 // This helps catch bugs as verifier errors. 
340 return true; 341 } 342 343 int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const { 344 assert(SIInstrInfo::isMUBUF(*MI)); 345 346 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 347 AMDGPU::OpName::offset); 348 return MI->getOperand(OffIdx).getImm(); 349 } 350 351 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, 352 int Idx) const { 353 if (!SIInstrInfo::isMUBUF(*MI)) 354 return 0; 355 356 assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 357 AMDGPU::OpName::vaddr) && 358 "Should never see frame index on non-address operand"); 359 360 return getMUBUFInstrOffset(MI); 361 } 362 363 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { 364 if (!MI->mayLoadOrStore()) 365 return false; 366 367 int64_t FullOffset = Offset + getMUBUFInstrOffset(MI); 368 369 return !isUInt<12>(FullOffset); 370 } 371 372 void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, 373 unsigned BaseReg, 374 int FrameIdx, 375 int64_t Offset) const { 376 MachineBasicBlock::iterator Ins = MBB->begin(); 377 DebugLoc DL; // Defaults to "unknown" 378 379 if (Ins != MBB->end()) 380 DL = Ins->getDebugLoc(); 381 382 MachineFunction *MF = MBB->getParent(); 383 const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); 384 const SIInstrInfo *TII = Subtarget.getInstrInfo(); 385 386 if (Offset == 0) { 387 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg) 388 .addFrameIndex(FrameIdx); 389 return; 390 } 391 392 MachineRegisterInfo &MRI = MF->getRegInfo(); 393 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 394 395 Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 396 397 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 398 .addImm(Offset); 399 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg) 400 .addFrameIndex(FrameIdx); 401 402 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) 403 .addReg(OffsetReg, 
RegState::Kill) 404 .addReg(FIReg) 405 .addImm(0); // clamp bit 406 } 407 408 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, 409 int64_t Offset) const { 410 411 MachineBasicBlock *MBB = MI.getParent(); 412 MachineFunction *MF = MBB->getParent(); 413 const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); 414 const SIInstrInfo *TII = Subtarget.getInstrInfo(); 415 416 #ifndef NDEBUG 417 // FIXME: Is it possible to be storing a frame index to itself? 418 bool SeenFI = false; 419 for (const MachineOperand &MO: MI.operands()) { 420 if (MO.isFI()) { 421 if (SeenFI) 422 llvm_unreachable("should not see multiple frame indices"); 423 424 SeenFI = true; 425 } 426 } 427 #endif 428 429 MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); 430 assert(FIOp && FIOp->isFI() && "frame index must be address operand"); 431 assert(TII->isMUBUF(MI)); 432 assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() == 433 MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() && 434 "should only be seeing frame offset relative FrameIndex"); 435 436 437 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); 438 int64_t NewOffset = OffsetOp->getImm() + Offset; 439 assert(isUInt<12>(NewOffset) && "offset should be legal"); 440 441 FIOp->ChangeToRegister(BaseReg, false); 442 OffsetOp->setImm(NewOffset); 443 } 444 445 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, 446 unsigned BaseReg, 447 int64_t Offset) const { 448 if (!SIInstrInfo::isMUBUF(*MI)) 449 return false; 450 451 int64_t NewOffset = Offset + getMUBUFInstrOffset(MI); 452 453 return isUInt<12>(NewOffset); 454 } 455 456 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( 457 const MachineFunction &MF, unsigned Kind) const { 458 // This is inaccurate. It depends on the instruction and address space. 
The 459 // only place where we should hit this is for dealing with frame indexes / 460 // private accesses, so this is correct in that case. 461 return &AMDGPU::VGPR_32RegClass; 462 } 463 464 static unsigned getNumSubRegsForSpillOp(unsigned Op) { 465 466 switch (Op) { 467 case AMDGPU::SI_SPILL_S1024_SAVE: 468 case AMDGPU::SI_SPILL_S1024_RESTORE: 469 case AMDGPU::SI_SPILL_V1024_SAVE: 470 case AMDGPU::SI_SPILL_V1024_RESTORE: 471 case AMDGPU::SI_SPILL_A1024_SAVE: 472 case AMDGPU::SI_SPILL_A1024_RESTORE: 473 return 32; 474 case AMDGPU::SI_SPILL_S512_SAVE: 475 case AMDGPU::SI_SPILL_S512_RESTORE: 476 case AMDGPU::SI_SPILL_V512_SAVE: 477 case AMDGPU::SI_SPILL_V512_RESTORE: 478 case AMDGPU::SI_SPILL_A512_SAVE: 479 case AMDGPU::SI_SPILL_A512_RESTORE: 480 return 16; 481 case AMDGPU::SI_SPILL_S256_SAVE: 482 case AMDGPU::SI_SPILL_S256_RESTORE: 483 case AMDGPU::SI_SPILL_V256_SAVE: 484 case AMDGPU::SI_SPILL_V256_RESTORE: 485 return 8; 486 case AMDGPU::SI_SPILL_S160_SAVE: 487 case AMDGPU::SI_SPILL_S160_RESTORE: 488 case AMDGPU::SI_SPILL_V160_SAVE: 489 case AMDGPU::SI_SPILL_V160_RESTORE: 490 return 5; 491 case AMDGPU::SI_SPILL_S128_SAVE: 492 case AMDGPU::SI_SPILL_S128_RESTORE: 493 case AMDGPU::SI_SPILL_V128_SAVE: 494 case AMDGPU::SI_SPILL_V128_RESTORE: 495 case AMDGPU::SI_SPILL_A128_SAVE: 496 case AMDGPU::SI_SPILL_A128_RESTORE: 497 return 4; 498 case AMDGPU::SI_SPILL_S96_SAVE: 499 case AMDGPU::SI_SPILL_S96_RESTORE: 500 case AMDGPU::SI_SPILL_V96_SAVE: 501 case AMDGPU::SI_SPILL_V96_RESTORE: 502 return 3; 503 case AMDGPU::SI_SPILL_S64_SAVE: 504 case AMDGPU::SI_SPILL_S64_RESTORE: 505 case AMDGPU::SI_SPILL_V64_SAVE: 506 case AMDGPU::SI_SPILL_V64_RESTORE: 507 case AMDGPU::SI_SPILL_A64_SAVE: 508 case AMDGPU::SI_SPILL_A64_RESTORE: 509 return 2; 510 case AMDGPU::SI_SPILL_S32_SAVE: 511 case AMDGPU::SI_SPILL_S32_RESTORE: 512 case AMDGPU::SI_SPILL_V32_SAVE: 513 case AMDGPU::SI_SPILL_V32_RESTORE: 514 case AMDGPU::SI_SPILL_A32_SAVE: 515 case AMDGPU::SI_SPILL_A32_RESTORE: 516 return 1; 517 
default: llvm_unreachable("Invalid spill opcode"); 518 } 519 } 520 521 static int getOffsetMUBUFStore(unsigned Opc) { 522 switch (Opc) { 523 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 524 return AMDGPU::BUFFER_STORE_DWORD_OFFSET; 525 case AMDGPU::BUFFER_STORE_BYTE_OFFEN: 526 return AMDGPU::BUFFER_STORE_BYTE_OFFSET; 527 case AMDGPU::BUFFER_STORE_SHORT_OFFEN: 528 return AMDGPU::BUFFER_STORE_SHORT_OFFSET; 529 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: 530 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; 531 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: 532 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; 533 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: 534 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; 535 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: 536 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; 537 default: 538 return -1; 539 } 540 } 541 542 static int getOffsetMUBUFLoad(unsigned Opc) { 543 switch (Opc) { 544 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 545 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 546 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: 547 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; 548 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: 549 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; 550 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: 551 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; 552 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: 553 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; 554 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: 555 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; 556 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: 557 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; 558 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: 559 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; 560 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: 561 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; 562 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: 563 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; 564 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: 565 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; 566 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: 567 return 
AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; 568 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: 569 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; 570 default: 571 return -1; 572 } 573 } 574 575 static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, 576 int Index, 577 unsigned Lane, 578 unsigned ValueReg, 579 bool IsKill) { 580 MachineBasicBlock *MBB = MI->getParent(); 581 MachineFunction *MF = MI->getParent()->getParent(); 582 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 583 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 584 const SIInstrInfo *TII = ST.getInstrInfo(); 585 586 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); 587 588 if (Reg == AMDGPU::NoRegister) 589 return MachineInstrBuilder(); 590 591 bool IsStore = MI->mayStore(); 592 MachineRegisterInfo &MRI = MF->getRegInfo(); 593 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 594 595 unsigned Dst = IsStore ? Reg : ValueReg; 596 unsigned Src = IsStore ? ValueReg : Reg; 597 unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32 598 : AMDGPU::V_ACCVGPR_READ_B32; 599 600 return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) 601 .addReg(Src, getKillRegState(IsKill)); 602 } 603 604 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not 605 // need to handle the case where an SGPR may need to be spilled while spilling. 606 static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, 607 MachineFrameInfo &MFI, 608 MachineBasicBlock::iterator MI, 609 int Index, 610 int64_t Offset) { 611 MachineBasicBlock *MBB = MI->getParent(); 612 const DebugLoc &DL = MI->getDebugLoc(); 613 bool IsStore = MI->mayStore(); 614 615 unsigned Opc = MI->getOpcode(); 616 int LoadStoreOp = IsStore ? 
617 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); 618 if (LoadStoreOp == -1) 619 return false; 620 621 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); 622 if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr()) 623 return true; 624 625 MachineInstrBuilder NewMI = 626 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) 627 .add(*Reg) 628 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) 629 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) 630 .addImm(Offset) 631 .addImm(0) // glc 632 .addImm(0) // slc 633 .addImm(0) // tfe 634 .addImm(0) // dlc 635 .cloneMemRefs(*MI); 636 637 const MachineOperand *VDataIn = TII->getNamedOperand(*MI, 638 AMDGPU::OpName::vdata_in); 639 if (VDataIn) 640 NewMI.add(*VDataIn); 641 return true; 642 } 643 644 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, 645 unsigned LoadStoreOp, 646 int Index, 647 unsigned ValueReg, 648 bool IsKill, 649 unsigned ScratchRsrcReg, 650 unsigned ScratchOffsetReg, 651 int64_t InstOffset, 652 MachineMemOperand *MMO, 653 RegScavenger *RS) const { 654 MachineBasicBlock *MBB = MI->getParent(); 655 MachineFunction *MF = MI->getParent()->getParent(); 656 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 657 const SIInstrInfo *TII = ST.getInstrInfo(); 658 const MachineFrameInfo &MFI = MF->getFrameInfo(); 659 660 const MCInstrDesc &Desc = TII->get(LoadStoreOp); 661 const DebugLoc &DL = MI->getDebugLoc(); 662 bool IsStore = Desc.mayStore(); 663 664 bool Scavenged = false; 665 unsigned SOffset = ScratchOffsetReg; 666 667 const unsigned EltSize = 4; 668 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); 669 unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT); 670 unsigned Size = NumSubRegs * EltSize; 671 int64_t Offset = InstOffset + MFI.getObjectOffset(Index); 672 int64_t ScratchOffsetRegDelta = 0; 673 674 unsigned Align = MFI.getObjectAlignment(Index); 675 const 
MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); 676 677 Register TmpReg = 678 hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg() 679 : Register(); 680 681 assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset"); 682 683 if (!isUInt<12>(Offset + Size - EltSize)) { 684 SOffset = AMDGPU::NoRegister; 685 686 // We currently only support spilling VGPRs to EltSize boundaries, meaning 687 // we can simplify the adjustment of Offset here to just scale with 688 // WavefrontSize. 689 Offset *= ST.getWavefrontSize(); 690 691 // We don't have access to the register scavenger if this function is called 692 // during PEI::scavengeFrameVirtualRegs(). 693 if (RS) 694 SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); 695 696 if (SOffset == AMDGPU::NoRegister) { 697 // There are no free SGPRs, and since we are in the process of spilling 698 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true 699 // on SI/CI and on VI it is true until we implement spilling using scalar 700 // stores), we have no way to free up an SGPR. Our solution here is to 701 // add the offset directly to the ScratchOffset register, and then 702 // subtract the offset after the spill to return ScratchOffset to it's 703 // original value. 704 SOffset = ScratchOffsetReg; 705 ScratchOffsetRegDelta = Offset; 706 } else { 707 Scavenged = true; 708 } 709 710 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) 711 .addReg(ScratchOffsetReg) 712 .addImm(Offset); 713 714 Offset = 0; 715 } 716 717 for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { 718 Register SubReg = NumSubRegs == 1 719 ? Register(ValueReg) 720 : getSubReg(ValueReg, getSubRegFromChannel(i)); 721 722 unsigned SOffsetRegState = 0; 723 unsigned SrcDstRegState = getDefRegState(!IsStore); 724 if (i + 1 == e) { 725 SOffsetRegState |= getKillRegState(Scavenged); 726 // The last implicit use carries the "Kill" flag. 
727 SrcDstRegState |= getKillRegState(IsKill); 728 } 729 730 auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill); 731 732 if (!MIB.getInstr()) { 733 unsigned FinalReg = SubReg; 734 if (TmpReg != AMDGPU::NoRegister) { 735 if (IsStore) 736 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg) 737 .addReg(SubReg, getKillRegState(IsKill)); 738 SubReg = TmpReg; 739 } 740 741 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); 742 MachineMemOperand *NewMMO 743 = MF->getMachineMemOperand(PInfo, MMO->getFlags(), 744 EltSize, MinAlign(Align, EltSize * i)); 745 746 MIB = BuildMI(*MBB, MI, DL, Desc) 747 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) 748 .addReg(ScratchRsrcReg) 749 .addReg(SOffset, SOffsetRegState) 750 .addImm(Offset) 751 .addImm(0) // glc 752 .addImm(0) // slc 753 .addImm(0) // tfe 754 .addImm(0) // dlc 755 .addMemOperand(NewMMO); 756 757 if (!IsStore && TmpReg != AMDGPU::NoRegister) 758 MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), 759 FinalReg) 760 .addReg(TmpReg, RegState::Kill); 761 } 762 763 if (NumSubRegs > 1) 764 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 765 } 766 767 if (ScratchOffsetRegDelta != 0) { 768 // Subtract the offset we added to the ScratchOffset register. 769 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg) 770 .addReg(ScratchOffsetReg) 771 .addImm(ScratchOffsetRegDelta); 772 } 773 } 774 775 static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize, 776 bool Store) { 777 if (SuperRegSize % 16 == 0) { 778 return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR : 779 AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR }; 780 } 781 782 if (SuperRegSize % 8 == 0) { 783 return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR : 784 AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR }; 785 } 786 787 return { 4, Store ? 
AMDGPU::S_BUFFER_STORE_DWORD_SGPR : 788 AMDGPU::S_BUFFER_LOAD_DWORD_SGPR}; 789 } 790 791 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, 792 int Index, 793 RegScavenger *RS, 794 bool OnlyToVGPR) const { 795 MachineBasicBlock *MBB = MI->getParent(); 796 MachineFunction *MF = MBB->getParent(); 797 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 798 DenseSet<unsigned> SGPRSpillVGPRDefinedSet; 799 800 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills 801 = MFI->getSGPRToVGPRSpills(Index); 802 bool SpillToVGPR = !VGPRSpills.empty(); 803 if (OnlyToVGPR && !SpillToVGPR) 804 return false; 805 806 MachineRegisterInfo &MRI = MF->getRegInfo(); 807 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 808 const SIInstrInfo *TII = ST.getInstrInfo(); 809 810 Register SuperReg = MI->getOperand(0).getReg(); 811 bool IsKill = MI->getOperand(0).isKill(); 812 const DebugLoc &DL = MI->getDebugLoc(); 813 814 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 815 816 bool SpillToSMEM = spillSGPRToSMEM(); 817 if (SpillToSMEM && OnlyToVGPR) 818 return false; 819 820 Register FrameReg = getFrameRegister(*MF); 821 822 assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && 823 SuperReg != MFI->getFrameOffsetReg() && 824 SuperReg != MFI->getScratchWaveOffsetReg())); 825 826 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 827 828 unsigned OffsetReg = AMDGPU::M0; 829 unsigned M0CopyReg = AMDGPU::NoRegister; 830 831 if (SpillToSMEM) { 832 if (RS->isRegUsed(AMDGPU::M0)) { 833 M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 834 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) 835 .addReg(AMDGPU::M0); 836 } 837 } 838 839 unsigned ScalarStoreOp; 840 unsigned EltSize = 4; 841 const TargetRegisterClass *RC = getPhysRegClass(SuperReg); 842 if (SpillToSMEM && isSGPRClass(RC)) { 843 // XXX - if private_element_size is larger than 4 it might be useful to be 844 // able to spill wider vmem spills. 
845 std::tie(EltSize, ScalarStoreOp) = 846 getSpillEltSize(getRegSizeInBits(*RC) / 8, true); 847 } 848 849 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); 850 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 851 852 // SubReg carries the "Kill" flag when SubReg == SuperReg. 853 unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); 854 for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { 855 Register SubReg = 856 NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); 857 858 if (SpillToSMEM) { 859 int64_t FrOffset = FrameInfo.getObjectOffset(Index); 860 861 // The allocated memory size is really the wavefront size * the frame 862 // index size. The widest register class is 64 bytes, so a 4-byte scratch 863 // allocation is enough to spill this in a single stack object. 864 // 865 // FIXME: Frame size/offsets are computed earlier than this, so the extra 866 // space is still unnecessarily allocated. 867 868 unsigned Align = FrameInfo.getObjectAlignment(Index); 869 MachinePointerInfo PtrInfo 870 = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); 871 MachineMemOperand *MMO 872 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 873 EltSize, MinAlign(Align, EltSize * i)); 874 875 // SMEM instructions only support a single offset, so increment the wave 876 // offset. 
877 878 int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); 879 if (Offset != 0) { 880 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) 881 .addReg(FrameReg) 882 .addImm(Offset); 883 } else { 884 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 885 .addReg(FrameReg); 886 } 887 888 BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp)) 889 .addReg(SubReg, getKillRegState(IsKill)) // sdata 890 .addReg(MFI->getScratchRSrcReg()) // sbase 891 .addReg(OffsetReg, RegState::Kill) // soff 892 .addImm(0) // glc 893 .addImm(0) // dlc 894 .addMemOperand(MMO); 895 896 continue; 897 } 898 899 if (SpillToVGPR) { 900 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 901 902 // During SGPR spilling to VGPR, determine if the VGPR is defined. The 903 // only circumstance in which we say it is undefined is when it is the 904 // first spill to this VGPR in the first basic block. 905 bool VGPRDefined = true; 906 if (MBB == &MF->front()) 907 VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second; 908 909 // Mark the "old value of vgpr" input undef only if this is the first sgpr 910 // spill to this specific vgpr in the first basic block. 911 BuildMI(*MBB, MI, DL, 912 TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), 913 Spill.VGPR) 914 .addReg(SubReg, getKillRegState(IsKill)) 915 .addImm(Spill.Lane) 916 .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef); 917 918 // FIXME: Since this spills to another register instead of an actual 919 // frame index, we should delete the frame index when all references to 920 // it are fixed. 921 } else { 922 // XXX - Can to VGPR spill fail for some subregisters but not others? 923 if (OnlyToVGPR) 924 return false; 925 926 // Spill SGPR to a frame index. 927 // TODO: Should VI try to spill to VGPR and then spill to SMEM? 928 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 929 // TODO: Should VI try to spill to VGPR and then spill to SMEM? 
// Move this SGPR piece into TmpReg so it can be stored with a VGPR spill.
      MachineInstrBuilder Mov
        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
        .addReg(SubReg, SubKillState);


      // There could be undef components of a spilled super register.
      // TODO: Can we detect this and skip the spill?
      if (NumSubRegs > 1) {
        // The last implicit use of the SuperReg carries the "Kill" flag.
        unsigned SuperKillState = 0;
        if (i + 1 == e)
          SuperKillState |= getKillRegState(IsKill);
        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
      }

      // Describe the EltSize-byte slice of the stack object written by this
      // piece, then emit the VGPR spill pseudo for the temporary.
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
        .addReg(TmpReg, RegState::Kill)       // src
        .addFrameIndex(Index)                 // vaddr
        .addReg(MFI->getScratchRSrcReg())     // srsrc
        .addReg(MFI->getStackPtrOffsetReg())  // soffset
        .addImm(i * 4)                        // offset
        .addMemOperand(MMO);
    }
  }

  // Put back the value of m0 that was saved before it was used as the offset
  // register.
  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  MFI->addToSpilledSGPRs(NumSubRegs);
  return true;
}

/// Expand the SGPR restore pseudo \p MI, which reloads operand 0 from frame
/// index \p Index, into real instructions inserted before \p MI.
///
/// Each piece of the register is reloaded in one of three ways: with a scalar
/// load when spillSGPRToSMEM() is set, with a V_READLANE_B32 when VGPR lanes
/// are recorded for \p Index, or through a temporary VGPR
/// (SI_SPILL_V32_RESTORE followed by V_READFIRSTLANE_B32) otherwise.
///
/// \param RS Only dereferenced on the SMEM path, to ask whether m0 is in use.
/// \param OnlyToVGPR If true, nothing is emitted and false is returned unless
///        the restore can be done purely from VGPR lanes.
/// \returns true if \p MI was expanded and erased, false if it was left alone.
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS,
                                 bool OnlyToVGPR) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  // A non-empty list selects the VGPR-lane path below; entry i is used for
  // piece i of the register.
  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
    = MFI->getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();

  Register SuperReg = MI->getOperand(0).getReg();
  bool SpillToSMEM = spillSGPRToSMEM();
  // The SMEM path is checked first inside the loop, so it cannot be combined
  // with a VGPR-only request.
  if (SpillToSMEM && OnlyToVGPR)
    return false;

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  // The SMEM path computes its address in m0; preserve the old value of m0 in
  // a virtual register if the scavenger reports it as in use.
  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  unsigned EltSize = 4;
  // Only assigned when SpillToSMEM is set and RC is an SGPR class; it is only
  // read on the SMEM path.
  unsigned ScalarLoadOp;

  Register FrameReg = getFrameRegister(*MF);

  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  if (SpillToSMEM && isSGPRClass(RC)) {
    // XXX - if private_element_size is larger than 4 it might be useful to be
    // able to spill wider vmem spills.
    std::tie(EltSize, ScalarLoadOp) =
          getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
  }

  // An empty split list means the register is a single EltSize-sized piece.
  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  // Offset of the stack object; only the SMEM path below reads it.
  int64_t FrOffset = FrameInfo.getObjectOffset(Index);

  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    Register SubReg =
        NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToSMEM) {
      // FIXME: Size may be > 4 but extra bytes wasted.
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                   EltSize, MinAlign(Align, EltSize * i));

      // The object offset is scaled by the wavefront size, then the offset of
      // this piece (EltSize * i) is added.
      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(FrameReg)
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(FrameReg);
      }

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
        .addReg(MFI->getScratchRSrcReg())  // sbase
        .addReg(OffsetReg, RegState::Kill) // soff
        .addImm(0)                         // glc
        .addImm(0)                         // dlc
        .addMemOperand(MMO);

      // Only the first piece carries the implicit def of the full register.
      if (NumSubRegs > 1 && i == 0)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);

      continue;
    }

    if (SpillToVGPR) {
      // Read the piece back from the VGPR lane recorded for it.
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
      auto MIB =
        BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
                SubReg)
          .addReg(Spill.VGPR)
          .addImm(Spill.Lane);

      // Only the first piece carries the implicit def of the full register.
      if (NumSubRegs > 1 && i == 0)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);
    } else {
      // NOTE(review): this looks unreachable - SpillToVGPR is false here, and
      // OnlyToVGPR && !SpillToVGPR already returned above. Confirm before
      // removing.
      if (OnlyToVGPR)
        return false;

      // Restore SGPR from a stack slot.
      // FIXME: We should use S_LOAD_DWORD here for VI.
      Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned Align = FrameInfo.getObjectAlignment(Index);

      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);

      MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
        MachineMemOperand::MOLoad, EltSize,
        MinAlign(Align, EltSize * i));

      // Reload into a temporary VGPR, then move it to the SGPR piece.
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
        .addFrameIndex(Index)                 // vaddr
        .addReg(MFI->getScratchRSrcReg())     // srsrc
        .addReg(MFI->getStackPtrOffsetReg())  // soffset
        .addImm(i * 4)                        // offset
        .addMemOperand(MMO);

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
        .addReg(TmpReg, RegState::Kill);

      // Unlike the two paths above, every piece gets the implicit def here.
      if (NumSubRegs > 1)
        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
    }
  }

  // Put back the value of m0 that was saved before it was used as the offset
  // register.
  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  return true;
}

/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
/// a VGPR and the stack slot can be safely eliminated when all other users are
/// handled.
1121 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 1122 MachineBasicBlock::iterator MI, 1123 int FI, 1124 RegScavenger *RS) const { 1125 switch (MI->getOpcode()) { 1126 case AMDGPU::SI_SPILL_S1024_SAVE: 1127 case AMDGPU::SI_SPILL_S512_SAVE: 1128 case AMDGPU::SI_SPILL_S256_SAVE: 1129 case AMDGPU::SI_SPILL_S160_SAVE: 1130 case AMDGPU::SI_SPILL_S128_SAVE: 1131 case AMDGPU::SI_SPILL_S96_SAVE: 1132 case AMDGPU::SI_SPILL_S64_SAVE: 1133 case AMDGPU::SI_SPILL_S32_SAVE: 1134 return spillSGPR(MI, FI, RS, true); 1135 case AMDGPU::SI_SPILL_S1024_RESTORE: 1136 case AMDGPU::SI_SPILL_S512_RESTORE: 1137 case AMDGPU::SI_SPILL_S256_RESTORE: 1138 case AMDGPU::SI_SPILL_S160_RESTORE: 1139 case AMDGPU::SI_SPILL_S128_RESTORE: 1140 case AMDGPU::SI_SPILL_S96_RESTORE: 1141 case AMDGPU::SI_SPILL_S64_RESTORE: 1142 case AMDGPU::SI_SPILL_S32_RESTORE: 1143 return restoreSGPR(MI, FI, RS, true); 1144 default: 1145 llvm_unreachable("not an SGPR spill instruction"); 1146 } 1147 } 1148 1149 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 1150 int SPAdj, unsigned FIOperandNum, 1151 RegScavenger *RS) const { 1152 MachineFunction *MF = MI->getParent()->getParent(); 1153 MachineRegisterInfo &MRI = MF->getRegInfo(); 1154 MachineBasicBlock *MBB = MI->getParent(); 1155 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1156 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 1157 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 1158 const SIInstrInfo *TII = ST.getInstrInfo(); 1159 DebugLoc DL = MI->getDebugLoc(); 1160 1161 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 1162 1163 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 1164 int Index = MI->getOperand(FIOperandNum).getIndex(); 1165 1166 Register FrameReg = getFrameRegister(*MF); 1167 1168 switch (MI->getOpcode()) { 1169 // SGPR register spill 1170 case AMDGPU::SI_SPILL_S1024_SAVE: 1171 case AMDGPU::SI_SPILL_S512_SAVE: 1172 case AMDGPU::SI_SPILL_S256_SAVE: 1173 case 
AMDGPU::SI_SPILL_S160_SAVE: 1174 case AMDGPU::SI_SPILL_S128_SAVE: 1175 case AMDGPU::SI_SPILL_S96_SAVE: 1176 case AMDGPU::SI_SPILL_S64_SAVE: 1177 case AMDGPU::SI_SPILL_S32_SAVE: { 1178 spillSGPR(MI, Index, RS); 1179 break; 1180 } 1181 1182 // SGPR register restore 1183 case AMDGPU::SI_SPILL_S1024_RESTORE: 1184 case AMDGPU::SI_SPILL_S512_RESTORE: 1185 case AMDGPU::SI_SPILL_S256_RESTORE: 1186 case AMDGPU::SI_SPILL_S160_RESTORE: 1187 case AMDGPU::SI_SPILL_S128_RESTORE: 1188 case AMDGPU::SI_SPILL_S96_RESTORE: 1189 case AMDGPU::SI_SPILL_S64_RESTORE: 1190 case AMDGPU::SI_SPILL_S32_RESTORE: { 1191 restoreSGPR(MI, Index, RS); 1192 break; 1193 } 1194 1195 // VGPR register spill 1196 case AMDGPU::SI_SPILL_V1024_SAVE: 1197 case AMDGPU::SI_SPILL_V512_SAVE: 1198 case AMDGPU::SI_SPILL_V256_SAVE: 1199 case AMDGPU::SI_SPILL_V160_SAVE: 1200 case AMDGPU::SI_SPILL_V128_SAVE: 1201 case AMDGPU::SI_SPILL_V96_SAVE: 1202 case AMDGPU::SI_SPILL_V64_SAVE: 1203 case AMDGPU::SI_SPILL_V32_SAVE: 1204 case AMDGPU::SI_SPILL_A1024_SAVE: 1205 case AMDGPU::SI_SPILL_A512_SAVE: 1206 case AMDGPU::SI_SPILL_A128_SAVE: 1207 case AMDGPU::SI_SPILL_A64_SAVE: 1208 case AMDGPU::SI_SPILL_A32_SAVE: { 1209 const MachineOperand *VData = TII->getNamedOperand(*MI, 1210 AMDGPU::OpName::vdata); 1211 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1212 MFI->getStackPtrOffsetReg()); 1213 1214 buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, 1215 Index, 1216 VData->getReg(), VData->isKill(), 1217 TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), 1218 FrameReg, 1219 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1220 *MI->memoperands_begin(), 1221 RS); 1222 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 1223 MI->eraseFromParent(); 1224 break; 1225 } 1226 case AMDGPU::SI_SPILL_V32_RESTORE: 1227 case AMDGPU::SI_SPILL_V64_RESTORE: 1228 case AMDGPU::SI_SPILL_V96_RESTORE: 1229 case AMDGPU::SI_SPILL_V128_RESTORE: 1230 case AMDGPU::SI_SPILL_V160_RESTORE: 
1231 case AMDGPU::SI_SPILL_V256_RESTORE: 1232 case AMDGPU::SI_SPILL_V512_RESTORE: 1233 case AMDGPU::SI_SPILL_V1024_RESTORE: 1234 case AMDGPU::SI_SPILL_A32_RESTORE: 1235 case AMDGPU::SI_SPILL_A64_RESTORE: 1236 case AMDGPU::SI_SPILL_A128_RESTORE: 1237 case AMDGPU::SI_SPILL_A512_RESTORE: 1238 case AMDGPU::SI_SPILL_A1024_RESTORE: { 1239 const MachineOperand *VData = TII->getNamedOperand(*MI, 1240 AMDGPU::OpName::vdata); 1241 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1242 MFI->getStackPtrOffsetReg()); 1243 1244 buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, 1245 Index, 1246 VData->getReg(), VData->isKill(), 1247 TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), 1248 FrameReg, 1249 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1250 *MI->memoperands_begin(), 1251 RS); 1252 MI->eraseFromParent(); 1253 break; 1254 } 1255 1256 default: { 1257 const DebugLoc &DL = MI->getDebugLoc(); 1258 bool IsMUBUF = TII->isMUBUF(*MI); 1259 1260 if (!IsMUBUF && !MFI->isEntryFunction()) { 1261 // Convert to an absolute stack address by finding the offset from the 1262 // scratch wave base and scaling by the wave size. 1263 // 1264 // In an entry function/kernel the offset is already the absolute 1265 // address relative to the frame register. 1266 1267 Register DiffReg = 1268 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 1269 1270 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; 1271 Register ResultReg = IsCopy ? 1272 MI->getOperand(0).getReg() : 1273 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1274 1275 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) 1276 .addReg(FrameReg) 1277 .addReg(MFI->getScratchWaveOffsetReg()); 1278 1279 int64_t Offset = FrameInfo.getObjectOffset(Index); 1280 if (Offset == 0) { 1281 // XXX - This never happens because of emergency scavenging slot at 0? 
1282 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) 1283 .addImm(Log2_32(ST.getWavefrontSize())) 1284 .addReg(DiffReg); 1285 } else { 1286 Register ScaledReg = 1287 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1288 1289 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) 1290 .addImm(Log2_32(ST.getWavefrontSize())) 1291 .addReg(DiffReg, RegState::Kill); 1292 1293 // TODO: Fold if use instruction is another add of a constant. 1294 if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 1295 TII->getAddNoCarry(*MBB, MI, DL, ResultReg) 1296 .addImm(Offset) 1297 .addReg(ScaledReg, RegState::Kill) 1298 .addImm(0); // clamp bit 1299 } else { 1300 Register ConstOffsetReg = 1301 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 1302 1303 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 1304 .addImm(Offset); 1305 TII->getAddNoCarry(*MBB, MI, DL, ResultReg) 1306 .addReg(ConstOffsetReg, RegState::Kill) 1307 .addReg(ScaledReg, RegState::Kill) 1308 .addImm(0); // clamp bit 1309 } 1310 } 1311 1312 // Don't introduce an extra copy if we're just materializing in a mov. 1313 if (IsCopy) 1314 MI->eraseFromParent(); 1315 else 1316 FIOp.ChangeToRegister(ResultReg, false, false, true); 1317 return; 1318 } 1319 1320 if (IsMUBUF) { 1321 // Disable offen so we don't need a 0 vgpr base. 
1322 assert(static_cast<int>(FIOperandNum) == 1323 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 1324 AMDGPU::OpName::vaddr)); 1325 1326 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1327 MFI->getStackPtrOffsetReg()); 1328 1329 TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg); 1330 1331 int64_t Offset = FrameInfo.getObjectOffset(Index); 1332 int64_t OldImm 1333 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 1334 int64_t NewOffset = OldImm + Offset; 1335 1336 if (isUInt<12>(NewOffset) && 1337 buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) { 1338 MI->eraseFromParent(); 1339 return; 1340 } 1341 } 1342 1343 // If the offset is simply too big, don't convert to a scratch wave offset 1344 // relative index. 1345 1346 int64_t Offset = FrameInfo.getObjectOffset(Index); 1347 FIOp.ChangeToImmediate(Offset); 1348 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 1349 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1350 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 1351 .addImm(Offset); 1352 FIOp.ChangeToRegister(TmpReg, false, false, true); 1353 } 1354 } 1355 } 1356 } 1357 1358 StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { 1359 return AMDGPUInstPrinter::getRegisterName(Reg); 1360 } 1361 1362 // FIXME: This is very slow. It might be worth creating a map from physreg to 1363 // register class. 
1364 const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { 1365 assert(!Register::isVirtualRegister(Reg)); 1366 1367 static const TargetRegisterClass *const BaseClasses[] = { 1368 &AMDGPU::VGPR_32RegClass, 1369 &AMDGPU::SReg_32RegClass, 1370 &AMDGPU::AGPR_32RegClass, 1371 &AMDGPU::VReg_64RegClass, 1372 &AMDGPU::SReg_64RegClass, 1373 &AMDGPU::AReg_64RegClass, 1374 &AMDGPU::VReg_96RegClass, 1375 &AMDGPU::SReg_96RegClass, 1376 &AMDGPU::VReg_128RegClass, 1377 &AMDGPU::SReg_128RegClass, 1378 &AMDGPU::AReg_128RegClass, 1379 &AMDGPU::VReg_160RegClass, 1380 &AMDGPU::SReg_160RegClass, 1381 &AMDGPU::VReg_256RegClass, 1382 &AMDGPU::SReg_256RegClass, 1383 &AMDGPU::VReg_512RegClass, 1384 &AMDGPU::SReg_512RegClass, 1385 &AMDGPU::AReg_512RegClass, 1386 &AMDGPU::SReg_1024RegClass, 1387 &AMDGPU::VReg_1024RegClass, 1388 &AMDGPU::AReg_1024RegClass, 1389 &AMDGPU::SCC_CLASSRegClass, 1390 &AMDGPU::Pseudo_SReg_32RegClass, 1391 &AMDGPU::Pseudo_SReg_128RegClass, 1392 }; 1393 1394 for (const TargetRegisterClass *BaseClass : BaseClasses) { 1395 if (BaseClass->contains(Reg)) { 1396 return BaseClass; 1397 } 1398 } 1399 return nullptr; 1400 } 1401 1402 // TODO: It might be helpful to have some target specific flags in 1403 // TargetRegisterClass to mark which classes are VGPRs to make this trivial. 
1404 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { 1405 unsigned Size = getRegSizeInBits(*RC); 1406 if (Size < 32) 1407 return false; 1408 switch (Size) { 1409 case 32: 1410 return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; 1411 case 64: 1412 return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; 1413 case 96: 1414 return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; 1415 case 128: 1416 return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; 1417 case 160: 1418 return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr; 1419 case 256: 1420 return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; 1421 case 512: 1422 return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; 1423 case 1024: 1424 return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; 1425 default: 1426 llvm_unreachable("Invalid register class size"); 1427 } 1428 } 1429 1430 bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const { 1431 unsigned Size = getRegSizeInBits(*RC); 1432 if (Size < 32) 1433 return false; 1434 switch (Size) { 1435 case 32: 1436 return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr; 1437 case 64: 1438 return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr; 1439 case 96: 1440 return false; 1441 case 128: 1442 return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr; 1443 case 160: 1444 case 256: 1445 return false; 1446 case 512: 1447 return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr; 1448 case 1024: 1449 return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr; 1450 default: 1451 llvm_unreachable("Invalid register class size"); 1452 } 1453 } 1454 1455 const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( 1456 const TargetRegisterClass *SRC) const { 1457 switch (getRegSizeInBits(*SRC)) { 1458 case 32: 1459 return &AMDGPU::VGPR_32RegClass; 1460 case 64: 1461 return 
&AMDGPU::VReg_64RegClass; 1462 case 96: 1463 return &AMDGPU::VReg_96RegClass; 1464 case 128: 1465 return &AMDGPU::VReg_128RegClass; 1466 case 160: 1467 return &AMDGPU::VReg_160RegClass; 1468 case 256: 1469 return &AMDGPU::VReg_256RegClass; 1470 case 512: 1471 return &AMDGPU::VReg_512RegClass; 1472 case 1024: 1473 return &AMDGPU::VReg_1024RegClass; 1474 default: 1475 llvm_unreachable("Invalid register class size"); 1476 } 1477 } 1478 1479 const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass( 1480 const TargetRegisterClass *SRC) const { 1481 switch (getRegSizeInBits(*SRC)) { 1482 case 32: 1483 return &AMDGPU::AGPR_32RegClass; 1484 case 64: 1485 return &AMDGPU::AReg_64RegClass; 1486 case 128: 1487 return &AMDGPU::AReg_128RegClass; 1488 case 512: 1489 return &AMDGPU::AReg_512RegClass; 1490 case 1024: 1491 return &AMDGPU::AReg_1024RegClass; 1492 default: 1493 llvm_unreachable("Invalid register class size"); 1494 } 1495 } 1496 1497 const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( 1498 const TargetRegisterClass *VRC) const { 1499 switch (getRegSizeInBits(*VRC)) { 1500 case 32: 1501 return &AMDGPU::SGPR_32RegClass; 1502 case 64: 1503 return &AMDGPU::SReg_64RegClass; 1504 case 96: 1505 return &AMDGPU::SReg_96RegClass; 1506 case 128: 1507 return &AMDGPU::SReg_128RegClass; 1508 case 160: 1509 return &AMDGPU::SReg_160RegClass; 1510 case 256: 1511 return &AMDGPU::SReg_256RegClass; 1512 case 512: 1513 return &AMDGPU::SReg_512RegClass; 1514 case 1024: 1515 return &AMDGPU::SReg_1024RegClass; 1516 default: 1517 llvm_unreachable("Invalid register class size"); 1518 } 1519 } 1520 1521 const TargetRegisterClass *SIRegisterInfo::getSubRegClass( 1522 const TargetRegisterClass *RC, unsigned SubIdx) const { 1523 if (SubIdx == AMDGPU::NoSubRegister) 1524 return RC; 1525 1526 // We can assume that each lane corresponds to one 32-bit register. 
1527 unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes(); 1528 if (isSGPRClass(RC)) { 1529 switch (Count) { 1530 case 1: 1531 return &AMDGPU::SGPR_32RegClass; 1532 case 2: 1533 return &AMDGPU::SReg_64RegClass; 1534 case 3: 1535 return &AMDGPU::SReg_96RegClass; 1536 case 4: 1537 return &AMDGPU::SReg_128RegClass; 1538 case 5: 1539 return &AMDGPU::SReg_160RegClass; 1540 case 8: 1541 return &AMDGPU::SReg_256RegClass; 1542 case 16: 1543 return &AMDGPU::SReg_512RegClass; 1544 case 32: /* fall-through */ 1545 default: 1546 llvm_unreachable("Invalid sub-register class size"); 1547 } 1548 } else if (hasAGPRs(RC)) { 1549 switch (Count) { 1550 case 1: 1551 return &AMDGPU::AGPR_32RegClass; 1552 case 2: 1553 return &AMDGPU::AReg_64RegClass; 1554 case 4: 1555 return &AMDGPU::AReg_128RegClass; 1556 case 16: 1557 return &AMDGPU::AReg_512RegClass; 1558 case 32: /* fall-through */ 1559 default: 1560 llvm_unreachable("Invalid sub-register class size"); 1561 } 1562 } else { 1563 switch (Count) { 1564 case 1: 1565 return &AMDGPU::VGPR_32RegClass; 1566 case 2: 1567 return &AMDGPU::VReg_64RegClass; 1568 case 3: 1569 return &AMDGPU::VReg_96RegClass; 1570 case 4: 1571 return &AMDGPU::VReg_128RegClass; 1572 case 5: 1573 return &AMDGPU::VReg_160RegClass; 1574 case 8: 1575 return &AMDGPU::VReg_256RegClass; 1576 case 16: 1577 return &AMDGPU::VReg_512RegClass; 1578 case 32: /* fall-through */ 1579 default: 1580 llvm_unreachable("Invalid sub-register class size"); 1581 } 1582 } 1583 } 1584 1585 bool SIRegisterInfo::shouldRewriteCopySrc( 1586 const TargetRegisterClass *DefRC, 1587 unsigned DefSubReg, 1588 const TargetRegisterClass *SrcRC, 1589 unsigned SrcSubReg) const { 1590 // We want to prefer the smallest register class possible, so we don't want to 1591 // stop and rewrite on anything that looks like a subregister 1592 // extract. 
Operations mostly don't care about the super register class, so we 1593 // only want to stop on the most basic of copies between the same register 1594 // class. 1595 // 1596 // e.g. if we have something like 1597 // %0 = ... 1598 // %1 = ... 1599 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 1600 // %3 = COPY %2, sub0 1601 // 1602 // We want to look through the COPY to find: 1603 // => %3 = COPY %0 1604 1605 // Plain copy. 1606 return getCommonSubClass(DefRC, SrcRC) != nullptr; 1607 } 1608 1609 /// Returns a register that is not used at any point in the function. 1610 /// If all registers are used, then this function will return 1611 // AMDGPU::NoRegister. 1612 unsigned 1613 SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, 1614 const TargetRegisterClass *RC, 1615 const MachineFunction &MF) const { 1616 1617 for (unsigned Reg : *RC) 1618 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 1619 return Reg; 1620 return AMDGPU::NoRegister; 1621 } 1622 1623 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, 1624 unsigned EltSize) const { 1625 if (EltSize == 4) { 1626 static const int16_t Sub0_31[] = { 1627 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1628 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1629 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 1630 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 1631 AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, 1632 AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, 1633 AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, 1634 AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31, 1635 }; 1636 1637 static const int16_t Sub0_15[] = { 1638 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1639 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1640 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 1641 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 1642 }; 1643 
1644 static const int16_t Sub0_7[] = { 1645 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1646 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1647 }; 1648 1649 static const int16_t Sub0_4[] = { 1650 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, 1651 }; 1652 1653 static const int16_t Sub0_3[] = { 1654 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1655 }; 1656 1657 static const int16_t Sub0_2[] = { 1658 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 1659 }; 1660 1661 static const int16_t Sub0_1[] = { 1662 AMDGPU::sub0, AMDGPU::sub1, 1663 }; 1664 1665 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1666 case 32: 1667 return {}; 1668 case 64: 1669 return makeArrayRef(Sub0_1); 1670 case 96: 1671 return makeArrayRef(Sub0_2); 1672 case 128: 1673 return makeArrayRef(Sub0_3); 1674 case 160: 1675 return makeArrayRef(Sub0_4); 1676 case 256: 1677 return makeArrayRef(Sub0_7); 1678 case 512: 1679 return makeArrayRef(Sub0_15); 1680 case 1024: 1681 return makeArrayRef(Sub0_31); 1682 default: 1683 llvm_unreachable("unhandled register size"); 1684 } 1685 } 1686 1687 if (EltSize == 8) { 1688 static const int16_t Sub0_31_64[] = { 1689 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1690 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 1691 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 1692 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 1693 AMDGPU::sub16_sub17, AMDGPU::sub18_sub19, 1694 AMDGPU::sub20_sub21, AMDGPU::sub22_sub23, 1695 AMDGPU::sub24_sub25, AMDGPU::sub26_sub27, 1696 AMDGPU::sub28_sub29, AMDGPU::sub30_sub31 1697 }; 1698 1699 static const int16_t Sub0_15_64[] = { 1700 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1701 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 1702 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 1703 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15 1704 }; 1705 1706 static const int16_t Sub0_7_64[] = { 1707 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1708 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7 1709 }; 1710 1711 1712 static const int16_t Sub0_3_64[] = { 1713 AMDGPU::sub0_sub1, 
AMDGPU::sub2_sub3 1714 }; 1715 1716 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1717 case 64: 1718 return {}; 1719 case 128: 1720 return makeArrayRef(Sub0_3_64); 1721 case 256: 1722 return makeArrayRef(Sub0_7_64); 1723 case 512: 1724 return makeArrayRef(Sub0_15_64); 1725 case 1024: 1726 return makeArrayRef(Sub0_31_64); 1727 default: 1728 llvm_unreachable("unhandled register size"); 1729 } 1730 } 1731 1732 if (EltSize == 16) { 1733 1734 static const int16_t Sub0_31_128[] = { 1735 AMDGPU::sub0_sub1_sub2_sub3, 1736 AMDGPU::sub4_sub5_sub6_sub7, 1737 AMDGPU::sub8_sub9_sub10_sub11, 1738 AMDGPU::sub12_sub13_sub14_sub15, 1739 AMDGPU::sub16_sub17_sub18_sub19, 1740 AMDGPU::sub20_sub21_sub22_sub23, 1741 AMDGPU::sub24_sub25_sub26_sub27, 1742 AMDGPU::sub28_sub29_sub30_sub31 1743 }; 1744 1745 static const int16_t Sub0_15_128[] = { 1746 AMDGPU::sub0_sub1_sub2_sub3, 1747 AMDGPU::sub4_sub5_sub6_sub7, 1748 AMDGPU::sub8_sub9_sub10_sub11, 1749 AMDGPU::sub12_sub13_sub14_sub15 1750 }; 1751 1752 static const int16_t Sub0_7_128[] = { 1753 AMDGPU::sub0_sub1_sub2_sub3, 1754 AMDGPU::sub4_sub5_sub6_sub7 1755 }; 1756 1757 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1758 case 128: 1759 return {}; 1760 case 256: 1761 return makeArrayRef(Sub0_7_128); 1762 case 512: 1763 return makeArrayRef(Sub0_15_128); 1764 case 1024: 1765 return makeArrayRef(Sub0_31_128); 1766 default: 1767 llvm_unreachable("unhandled register size"); 1768 } 1769 } 1770 1771 assert(EltSize == 32 && "unhandled elt size"); 1772 1773 static const int16_t Sub0_31_256[] = { 1774 AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, 1775 AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, 1776 AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23, 1777 AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 1778 }; 1779 1780 static const int16_t Sub0_15_256[] = { 1781 AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, 1782 AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 1783 }; 1784 1785 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 
1786 case 256: 1787 return {}; 1788 case 512: 1789 return makeArrayRef(Sub0_15_256); 1790 case 1024: 1791 return makeArrayRef(Sub0_31_256); 1792 default: 1793 llvm_unreachable("unhandled register size"); 1794 } 1795 } 1796 1797 const TargetRegisterClass* 1798 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, 1799 unsigned Reg) const { 1800 if (Register::isVirtualRegister(Reg)) 1801 return MRI.getRegClass(Reg); 1802 1803 return getPhysRegClass(Reg); 1804 } 1805 1806 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, 1807 unsigned Reg) const { 1808 const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); 1809 assert(RC && "Register class for the reg not found"); 1810 return hasVGPRs(RC); 1811 } 1812 1813 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, 1814 unsigned Reg) const { 1815 const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); 1816 assert(RC && "Register class for the reg not found"); 1817 return hasAGPRs(RC); 1818 } 1819 1820 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, 1821 const TargetRegisterClass *SrcRC, 1822 unsigned SubReg, 1823 const TargetRegisterClass *DstRC, 1824 unsigned DstSubReg, 1825 const TargetRegisterClass *NewRC, 1826 LiveIntervals &LIS) const { 1827 unsigned SrcSize = getRegSizeInBits(*SrcRC); 1828 unsigned DstSize = getRegSizeInBits(*DstRC); 1829 unsigned NewSize = getRegSizeInBits(*NewRC); 1830 1831 // Do not increase size of registers beyond dword, we would need to allocate 1832 // adjacent registers and constraint regalloc more than needed. 1833 1834 // Always allow dword coalescing. 
1835 if (SrcSize <= 32 || DstSize <= 32) 1836 return true; 1837 1838 return NewSize <= DstSize || NewSize <= SrcSize; 1839 } 1840 1841 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 1842 MachineFunction &MF) const { 1843 1844 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1845 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1846 1847 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), 1848 MF.getFunction()); 1849 switch (RC->getID()) { 1850 default: 1851 return AMDGPURegisterInfo::getRegPressureLimit(RC, MF); 1852 case AMDGPU::VGPR_32RegClassID: 1853 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); 1854 case AMDGPU::SGPR_32RegClassID: 1855 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); 1856 } 1857 } 1858 1859 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 1860 unsigned Idx) const { 1861 if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet()) 1862 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 1863 const_cast<MachineFunction &>(MF)); 1864 1865 if (Idx == getSGPRPressureSet()) 1866 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 1867 const_cast<MachineFunction &>(MF)); 1868 1869 return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx); 1870 } 1871 1872 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 1873 static const int Empty[] = { -1 }; 1874 1875 if (hasRegUnit(AMDGPU::M0, RegUnit)) 1876 return Empty; 1877 return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit); 1878 } 1879 1880 unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 1881 // Not a callee saved register. 
1882 return AMDGPU::SGPR30_SGPR31; 1883 } 1884 1885 const TargetRegisterClass * 1886 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 1887 const RegisterBank &RB, 1888 const MachineRegisterInfo &MRI) const { 1889 switch (Size) { 1890 case 1: { 1891 switch (RB.getID()) { 1892 case AMDGPU::VGPRRegBankID: 1893 return &AMDGPU::VGPR_32RegClass; 1894 case AMDGPU::VCCRegBankID: 1895 return isWave32 ? 1896 &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass; 1897 case AMDGPU::SGPRRegBankID: 1898 return &AMDGPU::SReg_32_XM0RegClass; 1899 case AMDGPU::SCCRegBankID: 1900 // This needs to return an allocatable class, so don't bother returning 1901 // the dummy SCC class. 1902 return &AMDGPU::SReg_32_XM0RegClass; 1903 default: 1904 llvm_unreachable("unknown register bank"); 1905 } 1906 } 1907 case 32: 1908 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : 1909 &AMDGPU::SReg_32_XM0RegClass; 1910 case 64: 1911 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : 1912 &AMDGPU::SReg_64_XEXECRegClass; 1913 case 96: 1914 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : 1915 &AMDGPU::SReg_96RegClass; 1916 case 128: 1917 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : 1918 &AMDGPU::SReg_128RegClass; 1919 case 160: 1920 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass : 1921 &AMDGPU::SReg_160RegClass; 1922 case 256: 1923 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass : 1924 &AMDGPU::SReg_256RegClass; 1925 case 512: 1926 return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass : 1927 &AMDGPU::SReg_512RegClass; 1928 default: 1929 if (Size < 32) 1930 return RB.getID() == AMDGPU::VGPRRegBankID ? 
&AMDGPU::VGPR_32RegClass : 1931 &AMDGPU::SReg_32_XM0RegClass; 1932 return nullptr; 1933 } 1934 } 1935 1936 const TargetRegisterClass * 1937 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 1938 const MachineRegisterInfo &MRI) const { 1939 if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg())) 1940 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); 1941 return nullptr; 1942 } 1943 1944 unsigned SIRegisterInfo::getVCC() const { 1945 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 1946 } 1947 1948 const TargetRegisterClass * 1949 SIRegisterInfo::getRegClass(unsigned RCID) const { 1950 switch ((int)RCID) { 1951 case AMDGPU::SReg_1RegClassID: 1952 return getBoolRC(); 1953 case AMDGPU::SReg_1_XEXECRegClassID: 1954 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 1955 : &AMDGPU::SReg_64_XEXECRegClass; 1956 case -1: 1957 return nullptr; 1958 default: 1959 return AMDGPURegisterInfo::getRegClass(RCID); 1960 } 1961 } 1962 1963 // Find reaching register definition 1964 MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg, 1965 MachineInstr &Use, 1966 MachineRegisterInfo &MRI, 1967 LiveIntervals *LIS) const { 1968 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 1969 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 1970 SlotIndex DefIdx; 1971 1972 if (Register::isVirtualRegister(Reg)) { 1973 if (!LIS->hasInterval(Reg)) 1974 return nullptr; 1975 LiveInterval &LI = LIS->getInterval(Reg); 1976 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 1977 : MRI.getMaxLaneMaskForVReg(Reg); 1978 VNInfo *V = nullptr; 1979 if (LI.hasSubRanges()) { 1980 for (auto &S : LI.subranges()) { 1981 if ((S.LaneMask & SubLanes) == SubLanes) { 1982 V = S.getVNInfoAt(UseIdx); 1983 break; 1984 } 1985 } 1986 } else { 1987 V = LI.getVNInfoAt(UseIdx); 1988 } 1989 if (!V) 1990 return nullptr; 1991 DefIdx = V->def; 1992 } else { 1993 // Find last def. 
1994 for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) { 1995 LiveRange &LR = LIS->getRegUnit(*Units); 1996 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 1997 if (!DefIdx.isValid() || 1998 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 1999 LIS->getInstructionFromIndex(V->def))) 2000 DefIdx = V->def; 2001 } else { 2002 return nullptr; 2003 } 2004 } 2005 } 2006 2007 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 2008 2009 if (!Def || !MDT.dominates(Def, &Use)) 2010 return nullptr; 2011 2012 assert(Def->modifiesRegister(Reg, this)); 2013 2014 return Def; 2015 } 2016