//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "SIRegisterInfo.h"
#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"

static cl::opt<bool> EnableSpillSGPRToVGPR(
    "amdgpu-spill-sgpr-to-vgpr",
    cl::desc("Enable spilling SGPRs to VGPRs"),
    cl::ReallyHidden,
    cl::init(true));

std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
      SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

  assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
         getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
         (getSubRegIndexLaneMask(AMDGPU::lo16) |
          getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
             getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
         "getNumCoveredRegs() will not work with generated subreg masks!");

  RegPressureIgnoredUnits.resize(getNumRegUnits());
  RegPressureIgnoredUnits.set(
      *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this));
  for (auto Reg : AMDGPU::VGPR_HI16RegClass)
    RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegSplitPartsFlag;

  static auto InitializeRegSplitPartsOnce = [this]() {
    for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
      unsigned Size = getSubRegIdxSize(Idx);
      if (Size & 31)
        continue;
      std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
      unsigned Pos = getSubRegIdxOffset(Idx);
      if (Pos % Size)
        continue;
      Pos /= Size;
      if (Vec.empty()) {
        unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
        Vec.resize(MaxNumParts);
      }
      Vec[Pos] = Idx;
    }
  };

  static llvm::once_flag InitializeSubRegFromChannelTableFlag;

  static auto InitializeSubRegFromChannelTableOnce = [this]() {
    for (auto &Row : SubRegFromChannelTable)
      Row.fill(AMDGPU::NoSubRegister);
    for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
      unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
      unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
      assert(Width < SubRegFromChannelTableWidthMap.size());
      Width = SubRegFromChannelTableWidthMap[Width];
      if (Width == 0)
        continue;
      unsigned TableIdx = Width - 1;
      assert(TableIdx < SubRegFromChannelTable.size());
      assert(Offset < SubRegFromChannelTable[TableIdx].size());
      SubRegFromChannelTable[TableIdx][Offset] = Idx;
    }
  };

  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
  llvm::call_once(InitializeSubRegFromChannelTableFlag,
                  InitializeSubRegFromChannelTableOnce);
}

void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
                                           MCRegister Reg) const {
  MCRegAliasIterator R(Reg, this, true);

  for (; R.isValid(); ++R)
    Reserved.set(*R);
}

// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
  const MachineFunction *MF) const {
  CallingConv::ID CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
  case CallingConv::AMDGPU_Gfx:
    return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
               ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
               : CSR_AMDGPU_HighRegs_SaveList;
  default: {
    // Dummy to not crash RegisterClassInfo.
    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
    return &NoCalleeSavedReg;
  }
  }
}

const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
  return nullptr;
}

const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                     CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
  case CallingConv::AMDGPU_Gfx:
    return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
               ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
               : CSR_AMDGPU_HighRegs_RegMask;
  default:
    return nullptr;
  }
}

const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
  return CSR_AMDGPU_NoRegs_RegMask;
}

Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  const SIFrameLowering *TFI =
      MF.getSubtarget<GCNSubtarget>().getFrameLowering();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // During ISel lowering we always reserve the stack pointer in entry
  // functions, but never actually want to reference it when accessing our own
  // frame. If we need a frame pointer we use it, but otherwise we can just use
  // an immediate "0" which we represent by returning NoRegister.
  if (FuncInfo->isEntryFunction()) {
    return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
  }
  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                        : FuncInfo->getStackPtrOffsetReg();
}

bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
  // When we need stack realignment, we can't reference off of the
  // stack pointer, so we reserve a base pointer.
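  // Realignment is typically triggered by an over-aligned stack object. Once
  // the SP/FP have been realigned, incoming (fixed) objects are no longer at a
  // known offset from them, so a separate base pointer is needed; hence both
  // conditions below must hold.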
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.getNumFixedObjects() && needsStackRealignment(MF);
}

Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
  return CSR_AMDGPU_AllVGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
  return CSR_AMDGPU_AllAGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
  return CSR_AMDGPU_AllVectorRegs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
  return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}

unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
                                              unsigned NumRegs) {
  assert(NumRegs < SubRegFromChannelTableWidthMap.size());
  unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
  assert(NumRegIndex && "Not implemented");
  assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
  return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}

MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
  const MachineFunction &MF) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::MODE);

  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
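  // For context: TBA/TMA hold the trap handler base and memory addresses, and
  // TTMP0-TTMP15 are the trap handler temporary SGPRs.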
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated.
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);

  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
  // will result in bugs.
  if (isWave32) {
    Reserved.set(AMDGPU::VCC);
    Reserved.set(AMDGPU::VCC_HI);
  }

  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  // TODO: In an entry function without calls and without AGPR usage it is
  //       possible to use the whole register budget for VGPRs. It should even
  //       be possible to estimate the maximum AGPR/VGPR pressure and split the
  //       register file accordingly.
  if (ST.hasGFX90AInsts())
    MaxNumVGPRs /= 2;
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
    Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  for (auto Reg : AMDGPU::SReg_32RegClass) {
    Reserved.set(getSubReg(Reg, AMDGPU::hi16));
    Register Low = getSubReg(Reg, AMDGPU::lo16);
    // This is to prevent BB vcc liveness errors.
    if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
      Reserved.set(Low);
  }

  for (auto Reg : AMDGPU::AGPR_32RegClass) {
    Reserved.set(getSubReg(Reg, AMDGPU::hi16));
  }

  // Reserve all remaining AGPRs if there are no instructions to use them.
  if (!ST.hasMAIInsts()) {
    for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
      unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
  }

  // We have to assume the SP is needed in case there are calls in the
  // function, which is detected after the function is lowered. If we aren't
  // really going to need SP, don't bother reserving it.
  MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();

  if (StackPtrReg) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  MCRegister FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  if (hasBasePointer(MF)) {
    MCRegister BasePtrReg = getBaseRegister();
    reserveRegisterTuples(Reserved, BasePtrReg);
    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
  }

  for (MCRegister Reg : MFI->WWMReservedRegs) {
    reserveRegisterTuples(Reserved, Reg);
  }

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (auto SSpill : MFI->getSGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, SSpill.VGPR);

  return Reserved;
}

bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry, the base address is 0, so it can't possibly need any more
  // alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isEntryFunction())
    return false;

  return TargetRegisterInfo::canRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
  const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
  // spill.
  return false;
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
  const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
  const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}

int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return 0;

  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                             AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");

  return getScratchInstrOffset(MI);
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t FullOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
}

Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                      int FrameIdx,
                                                      int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
                                           : AMDGPU::V_MOV_B32_e32;

  Register BaseReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                             : &AMDGPU::VGPR_32RegClass);

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
      .addFrameIndex(FrameIdx);
    return BaseReg;
  }

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
                             : &AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
    .addFrameIndex(FrameIdx);

  if (ST.enableFlatScratch()) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(FIReg);
    return BaseReg;
  }

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(FIReg)
    .addImm(0); // clamp bit

  return BaseReg;
}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp = TII->getNamedOperand(MI, IsFlat ?
      AMDGPU::OpName::saddr : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif

  assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
         "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}

bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        Register BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t NewOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
  const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

static unsigned getNumSubRegsForSpillOp(unsigned Op) {
  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock::iterator MI,
                                           int Index,
                                           unsigned Lane,
                                           unsigned ValueReg,
                                           bool IsKill) {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ?
      AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
               .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // glc
          .addImm(0) // slc
          .addImm(0) // tfe
          .addImm(0) // dlc
          .addImm(0) // swz
          .addImm(0) // scc
          .cloneMemRefs(*MI);

  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
                                          unsigned LoadStoreOp,
                                          unsigned EltSize) {
  bool IsStore = TII->get(LoadStoreOp).mayStore();
  bool UseST =
    AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
    AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0;

  switch (EltSize) {
  case 4:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    break;
  case 8:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
    break;
  case 12:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
    break;
  case 16:
    LoadStoreOp = IsStore ?
      AMDGPU::SCRATCH_STORE_DWORDX4_SADDR : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
    break;
  default:
    llvm_unreachable("Unexpected spill load/store size!");
  }

  if (UseST)
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);

  return LoadStoreOp;
}

void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                         unsigned LoadStoreOp,
                                         int Index,
                                         Register ValueReg,
                                         bool IsKill,
                                         MCRegister ScratchOffsetReg,
                                         int64_t InstOffset,
                                         MachineMemOperand *MMO,
                                         RegScavenger *RS) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC);
  const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
  int64_t ScratchOffsetRegDelta = 0;

  if (IsFlat && EltSize > 4) {
    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    Desc = &TII->get(LoadStoreOp);
  }

  Align Alignment = MFI.getObjectAlign(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  assert((IsFlat || ((Offset % EltSize) == 0)) &&
         "unexpected VGPR spill offset");

  bool IsOffsetLegal = IsFlat
      ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true)
      : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
  if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
    SOffset = MCRegister();

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
    if (!IsFlat)
      Offset *= ST.getWavefrontSize();

    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs().
    if (RS)
      SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);

    if (!SOffset) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.
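      // The amount added here is remembered in ScratchOffsetRegDelta and is
      // undone with an S_SUB_U32 at the end of this function.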
      if (!ScratchOffsetReg)
        ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    if (!SOffset)
      report_fatal_error("could not scavenge SGPR to spill in entry function");

    if (ScratchOffsetReg == AMDGPU::NoRegister) {
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
        .addImm(Offset);
    } else {
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
        .addReg(ScratchOffsetReg)
        .addImm(Offset);
    }

    Offset = 0;
  }

  if (IsFlat && SOffset == AMDGPU::NoRegister) {
    assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
           && "Unexpected vaddr for flat scratch with a FI operand");

    assert(ST.hasFlatScratchSTMode());
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
    Desc = &TII->get(LoadStoreOp);
  }

  Register TmpReg;

  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
       ++i, RegOffset += EltSize) {
    if (i == NumSubRegs) {
      EltSize = RemSize;
      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    }
    Desc = &TII->get(LoadStoreOp);

    unsigned NumRegs = EltSize / 4;
    Register SubReg = e == 1
      ? ValueReg
      : Register(getSubReg(ValueReg,
                           getSubRegFromChannel(RegOffset / 4, NumRegs)));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    if (i + 1 == e) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
    bool NeedSuperRegDef = e > 1 && IsStore && i == 0;
    bool NeedSuperRegImpOperand = e > 1;

    unsigned Lane = RegOffset / 4;
    unsigned LaneE = (RegOffset + EltSize) / 4;
    for ( ; Lane != LaneE; ++Lane) {
      bool IsSubReg = e > 1 || EltSize > 4;
      Register Sub = IsSubReg
        ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
        : ValueReg;
      auto MIB = spillVGPRtoAGPR(ST, MI, Index, Lane, Sub, IsKill);
      if (!MIB.getInstr())
        break;
      if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
        MIB.addReg(ValueReg, RegState::ImplicitDefine);
        NeedSuperRegDef = false;
      }
      if (IsSubReg || NeedSuperRegImpOperand) {
        NeedSuperRegImpOperand = true;
        unsigned State = SrcDstRegState;
        if (Lane + 1 != LaneE)
          State &= ~RegState::Kill;
        MIB.addReg(ValueReg, RegState::Implicit | State);
      }
    }

    if (Lane == LaneE) // Fully spilled into AGPRs.
      continue;

    // Offset in bytes from the beginning of the ValueReg to its portion we
    // still need to spill. It may differ from RegOffset if a portion of the
    // current SubReg has already been spilled into AGPRs by the loop above.
    unsigned RemRegOffset = Lane * 4;
    unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset);
    if (RemEltSize != EltSize) { // Partially spilled to AGPRs.
      assert(IsFlat && EltSize > 4);

      unsigned NumRegs = RemEltSize / 4;
      SubReg = Register(getSubReg(ValueReg,
                                  getSubRegFromChannel(RemRegOffset / 4, NumRegs)));
      unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
      Desc = &TII->get(Opc);
    }

    unsigned FinalReg = SubReg;

    if (IsAGPR) {
      assert(EltSize == 4);

      if (!TmpReg) {
        assert(RS && "Needs to have RegScavenger to spill an AGPR!");
        // FIXME: change to scavengeRegisterBackwards()
        TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
        RS->setRegUsed(TmpReg);
      }
      if (IsStore) {
        auto AccRead = BuildMI(*MBB, MI, DL,
                               TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
                         .addReg(SubReg, getKillRegState(IsKill));
        if (NeedSuperRegDef)
          AccRead.addReg(ValueReg, RegState::ImplicitDefine);
        AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
      }
      SubReg = TmpReg;
    }

    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RemRegOffset);
    MachineMemOperand *NewMMO =
        MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
                                 commonAlignment(Alignment, RemRegOffset));

    auto MIB = BuildMI(*MBB, MI, DL, *Desc)
                 .addReg(SubReg,
                         getDefRegState(!IsStore) | getKillRegState(IsKill));
    if (!IsFlat)
      MIB.addReg(FuncInfo->getScratchRSrcReg());

    if (SOffset == AMDGPU::NoRegister) {
      if (!IsFlat)
        MIB.addImm(0);
    } else {
      MIB.addReg(SOffset, SOffsetRegState);
    }
    MIB.addImm(Offset + RemRegOffset)
       .addImm(0) // glc
       .addImm(0) // slc
       .addImm(0); // tfe for MUBUF or dlc for FLAT
    if (!IsFlat)
      MIB.addImm(0) // dlc
         .addImm(0); // swz
    MIB.addImm(0); // scc
    MIB.addMemOperand(NewMMO);

    if (!IsAGPR && NeedSuperRegDef)
      MIB.addReg(ValueReg, RegState::ImplicitDefine);

    if (!IsStore && TmpReg != AMDGPU::NoRegister) {
      MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
                    FinalReg)
              .addReg(TmpReg, RegState::Kill);
      MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    }

    if (NeedSuperRegImpOperand)
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
  }

  if (ScratchOffsetRegDelta != 0) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset)
      .addReg(SOffset)
      .addImm(ScratchOffsetRegDelta);
  }
}

// Generate a VMEM access which loads or stores the VGPR containing an SGPR
// spill such that all the lanes set in VGPRLanes are loaded or stored.
// This generates exec mask manipulation and will use SGPRs available in MI
// or VGPR lanes in the VGPR to save and restore the exec mask.
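// For example (with the values set up by spillSGPR/restoreSGPR below), a
// 64-bit SGPR pair spilled with EltSize == 4 occupies two VGPR lanes, so
// VGPRLanes == 0x3 and EXEC is temporarily set to that mask around the
// scratch access.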
void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
                                             int Index, int Offset,
                                             unsigned EltSize, Register VGPR,
                                             int64_t VGPRLanes,
                                             RegScavenger *RS,
                                             bool IsLoad) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  Register SuperReg = MI->getOperand(0).getReg();
  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
  unsigned FirstPart = Offset * 32;
  unsigned ExecLane = 0;

  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();

  // Cannot handle load/store to EXEC
  assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
         SuperReg != AMDGPU::EXEC && "exec should never spill");

  // On Wave32 only handle EXEC_LO.
  // On Wave64 only update EXEC_HI if there is sufficient space for a copy.
  bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI;

  unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  Register SavedExecReg;

  // Backup EXEC
  if (OnlyExecLo) {
    SavedExecReg =
        NumSubRegs == 1
            ? SuperReg
            : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]));
  } else {
    // If src/dst is an odd size it is possible subreg0 is not aligned.
    for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
      SavedExecReg = getMatchingSuperReg(
          getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0,
          &AMDGPU::SReg_64_XEXECRegClass);
      if (SavedExecReg)
        break;
    }
  }
  assert(SavedExecReg);
  BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);

  // Setup EXEC
  BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);

  // Load/store VGPR
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);

  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
                          ? getBaseRegister()
                          : getFrameRegister(*MF);

  Align Alignment = FrameInfo.getObjectAlign(Index);
  MachinePointerInfo PtrInfo =
      MachinePointerInfo::getFixedStack(*MF, Index);
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
      EltSize, Alignment);

  if (IsLoad) {
    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                          : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
    buildSpillLoadStore(MI, Opc,
                        Index,
                        VGPR, false,
                        FrameReg,
                        Offset * EltSize, MMO,
                        RS);
  } else {
    unsigned Opc = ST.enableFlatScratch() ?
        AMDGPU::SCRATCH_STORE_DWORD_SADDR : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
    buildSpillLoadStore(MI, Opc, Index, VGPR,
                        IsKill, FrameReg,
                        Offset * EltSize, MMO, RS);
    // This only ever adds one VGPR spill
    MFI->addToSpilledVGPRs(1);
  }

  // Restore EXEC
  BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
    .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill));

  // Restore clobbered SGPRs
  if (IsLoad) {
    // Nothing to do; register will be overwritten
  } else if (!IsKill) {
    // Restore SGPRs from appropriate VGPR lanes
    if (!OnlyExecLo) {
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
              getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
        .addReg(VGPR)
        .addImm(ExecLane + 1);
    }
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
            NumSubRegs == 1 ? SavedExecReg
                            : Register(getSubReg(
                                  SuperReg, SplitParts[FirstPart + ExecLane])))
      .addReg(VGPR, RegState::Kill)
      .addImm(ExecLane);
  }
}

bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS,
                               bool OnlyToVGPR) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
    = MFI->getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();

  Register SuperReg = MI->getOperand(0).getReg();
  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();

  assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
                         SuperReg != MFI->getFrameOffsetReg()));

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
         SuperReg != AMDGPU::EXEC && "exec should never spill");

  unsigned EltSize = 4;
  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  if (SpillToVGPR) {
    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(getSubReg(SuperReg, SplitParts[i]));
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];

      bool UseKill = IsKill && i == NumSubRegs - 1;

      // Mark the "old value of vgpr" input undef only if this is the first
      // sgpr spill to this specific vgpr in the first basic block.
      auto MIB =
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
              .addReg(SubReg, getKillRegState(UseKill))
              .addImm(Spill.Lane)
              .addReg(Spill.VGPR);

      if (i == 0 && NumSubRegs > 1) {
        // We may be spilling a super-register which is only partially defined,
        // and need to ensure later spills think the value is defined.
        MIB.addReg(SuperReg, RegState::ImplicitDefine);
      }

      if (NumSubRegs > 1)
        MIB.addReg(SuperReg, getKillRegState(UseKill) | RegState::Implicit);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
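      // Illustrative only: for a 64-bit pair such as s[30:31] spilled to a
      // spill VGPR vN at lanes L and L+1, this loop emits roughly
      //   v_writelane_b32 vN, s30, L
      //   v_writelane_b32 vN, s31, L+1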
    }
  } else {
    // Scavenged temporary VGPR to use. It must be scavenged once for any
    // number of spilled subregs.
    Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
    RS->setRegUsed(TmpVGPR);

    // SubReg carries the "Kill" flag when SubReg == SuperReg.
    unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);

    unsigned PerVGPR = 32;
    unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
    int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;

    for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
      unsigned TmpVGPRFlags = RegState::Undef;

      // Write sub registers into the VGPR
      for (unsigned i = Offset * PerVGPR,
                    e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
           i < e; ++i) {
        Register SubReg = NumSubRegs == 1
                              ? SuperReg
                              : Register(getSubReg(SuperReg, SplitParts[i]));

        MachineInstrBuilder WriteLane =
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR)
                .addReg(SubReg, SubKillState)
                .addImm(i % PerVGPR)
                .addReg(TmpVGPR, TmpVGPRFlags);
        TmpVGPRFlags = 0;

        // There could be undef components of a spilled super register.
        // TODO: Can we detect this and skip the spill?
        if (NumSubRegs > 1) {
          // The last implicit use of the SuperReg carries the "Kill" flag.
          unsigned SuperKillState = 0;
          if (i + 1 == NumSubRegs)
            SuperKillState |= getKillRegState(IsKill);
          WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
        }
      }

      // Write out VGPR
      buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
                              RS, false);
    }
  }

  MI->eraseFromParent();
  MFI->addToSpilledSGPRs(NumSubRegs);
  return true;
}

bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS,
                                 bool OnlyToVGPR) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
    = MFI->getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();

  Register SuperReg = MI->getOperand(0).getReg();

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
         SuperReg != AMDGPU::EXEC && "exec should never spill");

  unsigned EltSize = 4;

  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  if (SpillToVGPR) {
    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
      Register SubReg = NumSubRegs == 1 ?
          SuperReg : Register(getSubReg(SuperReg, SplitParts[i]));

      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
      auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
                     .addReg(Spill.VGPR)
                     .addImm(Spill.Lane);
      if (NumSubRegs > 1 && i == 0)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);
    }
  } else {
    Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
    RS->setRegUsed(TmpVGPR);

    unsigned PerVGPR = 32;
    unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
    int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;

    for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
      // Load in VGPR data
      buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
                              RS, true);

      // Unpack lanes
      for (unsigned i = Offset * PerVGPR,
                    e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
           i < e; ++i) {
        Register SubReg = NumSubRegs == 1
                              ? SuperReg
                              : Register(getSubReg(SuperReg, SplitParts[i]));

        bool LastSubReg = (i + 1 == e);
        auto MIB =
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
                .addReg(TmpVGPR, getKillRegState(LastSubReg))
                .addImm(i);
        if (NumSubRegs > 1 && i == 0)
          MIB.addReg(SuperReg, RegState::ImplicitDefine);
      }
    }
  }

  MI->eraseFromParent();
  return true;
}

/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
/// a VGPR and the stack slot can be safely eliminated when all other users are
/// handled.
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
  MachineBasicBlock::iterator MI,
  int FI,
  RegScavenger *RS) const {
  switch (MI->getOpcode()) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE:
    return spillSGPR(MI, FI, RS, true);
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
    return restoreSGPR(MI, FI, RS, true);
  default:
    llvm_unreachable("not an SGPR spill instruction");
  }
}

void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  Register FrameReg =
      FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) ?
          getBaseRegister() : getFrameRegister(*MF);

  switch (MI->getOpcode()) {
  // SGPR register spill
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE: {
    spillSGPR(MI, Index, RS);
    break;
  }

  // SGPR register restore
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE: {
    restoreSGPR(MI, Index, RS);
    break;
  }

  // VGPR register spill
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A32_SAVE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
           MFI->getStackPtrOffsetReg());

    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
    buildSpillLoadStore(MI, Opc,
                        Index,
                        VData->getReg(), VData->isKill(),
                        FrameReg,
                        TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
                        *MI->memoperands_begin(),
                        RS);
    MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_A1024_RESTORE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
           MFI->getStackPtrOffsetReg());

    unsigned Opc = ST.enableFlatScratch() ?
        AMDGPU::SCRATCH_LOAD_DWORD_SADDR : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
    buildSpillLoadStore(MI, Opc,
                        Index,
                        VData->getReg(), VData->isKill(),
                        FrameReg,
                        TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
                        *MI->memoperands_begin(),
                        RS);
    MI->eraseFromParent();
    break;
  }

  default: {
    const DebugLoc &DL = MI->getDebugLoc();

    int64_t Offset = FrameInfo.getObjectOffset(Index);
    if (ST.enableFlatScratch()) {
      if (TII->isFLATScratch(*MI)) {
        assert((int16_t)FIOperandNum ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::saddr));

        // The offset is always swizzled, just replace it.
        if (FrameReg)
          FIOp.ChangeToRegister(FrameReg, false);

        if (!Offset)
          return;

        MachineOperand *OffsetOp =
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
        int64_t NewOffset = Offset + OffsetOp->getImm();
        if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                   true)) {
          OffsetOp->setImm(NewOffset);
          if (FrameReg)
            return;
          Offset = 0;
        }

        assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
               "Unexpected vaddr for flat scratch with a FI operand");

        // On GFX10 we have ST mode to use no registers for an address.
        // Otherwise we need to materialize 0 into an SGPR.
        if (!Offset && ST.hasFlatScratchSTMode()) {
          unsigned Opc = MI->getOpcode();
          unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
          MI->RemoveOperand(
              AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
          MI->setDesc(TII->get(NewOpc));
          return;
        }
      }

      if (!FrameReg) {
        FIOp.ChangeToImmediate(Offset);
        if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
          return;
      }

      // We need to use a register here. Check if we can use an SGPR or need
      // a VGPR.
      FIOp.ChangeToRegister(AMDGPU::M0, false);
      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);

      if (!Offset && FrameReg && UseSGPR) {
        FIOp.setReg(FrameReg);
        return;
      }

      const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
                                              : &AMDGPU::VGPR_32RegClass;

      Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
      FIOp.setReg(TmpReg);
      FIOp.setIsKill(true);

      if ((!FrameReg || !Offset) && TmpReg) {
        unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
        auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
        if (FrameReg)
          MIB.addReg(FrameReg);
        else
          MIB.addImm(Offset);

        return;
      }

      Register TmpSReg =
          UseSGPR ? TmpReg
                  : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
                                         !UseSGPR);

      // TODO: for flat scratch another attempt can be made with a VGPR index
      //       if no SGPRs can be scavenged.
      if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
        report_fatal_error("Cannot scavenge register in FI elimination!");

      if (!TmpSReg) {
        // Use frame register and restore it after.
        TmpSReg = FrameReg;
        FIOp.setReg(FrameReg);
        FIOp.setIsKill(false);
      }

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg)
        .addReg(FrameReg)
        .addImm(Offset);

      if (!UseSGPR)
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addReg(TmpSReg, RegState::Kill);

      if (TmpSReg == FrameReg) {
        // Undo frame register modification.
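        // The S_ADD_U32 above turned FrameReg into FrameReg + Offset; emit a
        // matching S_SUB_U32 after MI so the rest of the block still sees the
        // original frame register value.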
          BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32),
                  FrameReg)
            .addReg(FrameReg)
            .addImm(Offset);
        }

        return;
      }

      bool IsMUBUF = TII->isMUBUF(*MI);

      if (!IsMUBUF && !MFI->isEntryFunction()) {
        // Convert to a swizzled stack address by scaling by the wave size.
        //
        // In an entry function/kernel the offset is already swizzled.

        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
        Register ResultReg =
            IsCopy ? MI->getOperand(0).getReg()
                   : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        if (Offset == 0) {
          // XXX - This never happens because of emergency scavenging slot at 0?
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
            .addImm(ST.getWavefrontSizeLog2())
            .addReg(FrameReg);
        } else {
          if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
            // Reuse ResultReg in intermediate step.
            Register ScaledReg = ResultReg;

            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
                    ScaledReg)
              .addImm(ST.getWavefrontSizeLog2())
              .addReg(FrameReg);

            const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;

            // TODO: Fold if use instruction is another add of a constant.
            if (IsVOP2 ||
                AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
              // FIXME: This can fail
              MIB.addImm(Offset);
              MIB.addReg(ScaledReg, RegState::Kill);
              if (!IsVOP2)
                MIB.addImm(0); // clamp bit
            } else {
              assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
                     "Need to reuse carry out register");

              // Use scavenged unused carry out as offset register.
              Register ConstOffsetReg;
              if (!isWave32)
                ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
              else
                ConstOffsetReg = MIB.getReg(1);

              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
                .addImm(Offset);
              MIB.addReg(ConstOffsetReg, RegState::Kill);
              MIB.addReg(ScaledReg, RegState::Kill);
              MIB.addImm(0); // clamp bit
            }
          } else {
            // We have to produce a carry out, and there isn't a free SGPR pair
            // for it. We can keep the whole computation on the SALU to avoid
            // clobbering an additional register at the cost of an extra mov.

            // We may have 1 free scratch SGPR even though a carry out is
            // unavailable. Only one additional mov is needed.
            Register TmpScaledReg =
                RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
            Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;

            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
              .addReg(FrameReg)
              .addImm(ST.getWavefrontSizeLog2());
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
              .addReg(ScaledReg, RegState::Kill)
              .addImm(Offset);
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
              .addReg(ScaledReg, RegState::Kill);

            // If there were truly no free SGPRs, we need to undo everything.
            if (!TmpScaledReg.isValid()) {
              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
                .addReg(ScaledReg, RegState::Kill)
                .addImm(Offset);
              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
                .addReg(FrameReg)
                .addImm(ST.getWavefrontSizeLog2());
            }
          }
        }

        // Don't introduce an extra copy if we're just materializing in a mov.
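        // If MI itself was just a V_MOV of the frame index, the address was
        // materialized directly into the mov's destination above, so MI can be
        // erased rather than rewritten.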
        if (IsCopy)
          MI->eraseFromParent();
        else
          FIOp.ChangeToRegister(ResultReg, false, false, true);
        return;
      }

      if (IsMUBUF) {
        // Disable offen so we don't need a 0 vgpr base.
        assert(static_cast<int>(FIOperandNum) ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::vaddr));

        auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
        assert((SOffset.isImm() && SOffset.getImm() == 0));

        if (FrameReg != AMDGPU::NoRegister)
          SOffset.ChangeToRegister(FrameReg, false);

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        int64_t OldImm
          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
        int64_t NewOffset = OldImm + Offset;

        if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
            buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
          MI->eraseFromParent();
          return;
        }
      }

      // If the offset is simply too big, don't convert to a scratch wave offset
      // relative index.

      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
        Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}

StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
  return AMDGPUInstPrinter::getRegisterName(Reg);
}

static const TargetRegisterClass *
getAnyVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::VReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::VReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::VReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::VReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::VReg_192RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::VReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::VReg_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::VReg_1024RegClass;

  return nullptr;
}

static const TargetRegisterClass *
getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::VReg_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::VReg_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::VReg_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::VReg_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::VReg_192_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::VReg_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::VReg_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::VReg_1024_Align2RegClass;

  return nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth == 1)
    return &AMDGPU::VReg_1RegClass;
  if (BitWidth <= 16)
    return &AMDGPU::VGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::VGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
                                : getAnyVGPRClassForBitWidth(BitWidth);
}

static const TargetRegisterClass *
getAnyAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AReg_192RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AReg_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AReg_1024RegClass;

  return nullptr;
}

static const TargetRegisterClass *
getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AReg_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AReg_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AReg_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AReg_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AReg_192_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AReg_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AReg_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AReg_1024_Align2RegClass;

  return nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth <= 16)
    return &AMDGPU::AGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::AGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
                                : getAnyAGPRClassForBitWidth(BitWidth);
}

const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 16)
    return &AMDGPU::SGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::SReg_32RegClass;
  if (BitWidth <= 64)
    return &AMDGPU::SReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::SGPR_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::SGPR_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::SGPR_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::SGPR_192RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::SGPR_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::SGPR_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::SGPR_1024RegClass;

  return nullptr;
}

// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
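// One possible shape for such a cache (a sketch only, not implemented here)
// would be a per-subtarget table, e.g. a
// DenseMap<MCRegister, const TargetRegisterClass *> populated lazily from the
// same BaseClasses scan below; for now every query walks the list from the
// smallest class upwards.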
const TargetRegisterClass *
SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_LO16RegClass,
    &AMDGPU::VGPR_HI16RegClass,
    &AMDGPU::SReg_LO16RegClass,
    &AMDGPU::AGPR_LO16RegClass,
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::AGPR_32RegClass,
    &AMDGPU::VReg_64_Align2RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::AReg_64_Align2RegClass,
    &AMDGPU::AReg_64RegClass,
    &AMDGPU::VReg_96_Align2RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::SReg_96RegClass,
    &AMDGPU::AReg_96_Align2RegClass,
    &AMDGPU::AReg_96RegClass,
    &AMDGPU::VReg_128_Align2RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::AReg_128_Align2RegClass,
    &AMDGPU::AReg_128RegClass,
    &AMDGPU::VReg_160_Align2RegClass,
    &AMDGPU::VReg_160RegClass,
    &AMDGPU::SReg_160RegClass,
    &AMDGPU::AReg_160_Align2RegClass,
    &AMDGPU::AReg_160RegClass,
    &AMDGPU::VReg_192_Align2RegClass,
    &AMDGPU::VReg_192RegClass,
    &AMDGPU::SReg_192RegClass,
    &AMDGPU::AReg_192_Align2RegClass,
    &AMDGPU::AReg_192RegClass,
    &AMDGPU::VReg_256_Align2RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::AReg_256_Align2RegClass,
    &AMDGPU::AReg_256RegClass,
    &AMDGPU::VReg_512_Align2RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::AReg_512_Align2RegClass,
    &AMDGPU::AReg_512RegClass,
    &AMDGPU::SReg_1024RegClass,
    &AMDGPU::VReg_1024_Align2RegClass,
    &AMDGPU::VReg_1024RegClass,
    &AMDGPU::AReg_1024_Align2RegClass,
    &AMDGPU::AReg_1024RegClass,
    &AMDGPU::SCC_CLASSRegClass,
    &AMDGPU::Pseudo_SReg_32RegClass,
    &AMDGPU::Pseudo_SReg_128RegClass,
  };

  for (const TargetRegisterClass *BaseClass : BaseClasses) {
    if (BaseClass->contains(Reg)) {
      return BaseClass;
    }
  }
  return nullptr;
}

bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
                               Register Reg) const {
  const TargetRegisterClass *RC;
  if (Reg.isVirtual())
    RC = MRI.getRegClass(Reg);
  else
    RC = getPhysRegClass(Reg);
  return isSGPRClass(RC);
}

// TODO: It might be helpful to have some target specific flags in
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
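// Until such flags exist, hasVGPRs/hasAGPRs answer the question indirectly:
// they look up the generic VGPR/AGPR class of the same bit width and test for
// a common subclass with RC. hasVGPRs special-cases 16-bit classes because
// VGPRs expose both lo16 and hi16 halves.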
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size == 16) {
    return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
           getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
  }
  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
  if (!VRC) {
    assert(Size < 32 && "Invalid register class size");
    return false;
  }
  return getCommonSubClass(VRC, RC) != nullptr;
}

bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size < 16)
    return false;
  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
  if (!ARC) {
    assert(getVGPRClassForBitWidth(Size) && "Invalid register class size");
    return false;
  }
  return getCommonSubClass(ARC, RC) != nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
  assert(VRC && "Invalid register class size");
  return VRC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
  assert(ARC && "Invalid register class size");
  return ARC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
  unsigned Size = getRegSizeInBits(*VRC);
  if (Size == 32)
    return &AMDGPU::SGPR_32RegClass;
  const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
  assert(SRC && "Invalid register class size");
  return SRC;
}

const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
    const TargetRegisterClass *RC, unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
  if (isSGPRClass(RC)) {
    if (Size == 32)
      RC = &AMDGPU::SGPR_32RegClass;
    else
      RC = getSGPRClassForBitWidth(Size);
  } else if (hasAGPRs(RC)) {
    RC = getAGPRClassForBitWidth(Size);
  } else {
    RC = getVGPRClassForBitWidth(Size);
  }
  assert(RC && "Invalid sub-register class size");
  return RC;
}

const TargetRegisterClass *
SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
                                         const TargetRegisterClass *SubRC,
                                         unsigned SubIdx) const {
  // Ensure this subregister index is aligned in the super register.
  const TargetRegisterClass *MatchRC =
      getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
  return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
}

bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
  if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
      OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
    return !ST.hasMFMAInlineLiteralBug();

  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
         OpType <= AMDGPU::OPERAND_SRC_LAST;
}

bool SIRegisterInfo::shouldRewriteCopySrc(
  const TargetRegisterClass *DefRC,
  unsigned DefSubReg,
  const TargetRegisterClass *SrcRC,
  unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want
  // to stop and rewrite on anything that looks like a subregister
  // extract. Operations mostly don't care about the super register class, so
  // we only want to stop on the most basic of copies between the same register
  // class.
  //
  // e.g. if we have something like
  // %0 = ...
  // %1 = ...
  // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
  // %3 = COPY %2, sub0
  //
  // We want to look through the COPY to find:
  //  => %3 = COPY %0

  // Plain copy.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
  // TODO: 64-bit operands have extending behavior from 32-bit literal.
  return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
         OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
}

/// Returns the lowest register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return the
/// highest unused register.
MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
                                              const TargetRegisterClass *RC,
                                              const MachineFunction &MF,
                                              bool ReserveHighestVGPR) const {
  if (ReserveHighestVGPR) {
    for (MCRegister Reg : reverse(*RC))
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  } else {
    for (MCRegister Reg : *RC)
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  }
  return MCRegister();
}

ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                   unsigned EltSize) const {
  const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
  assert(RegBitWidth >= 32 && RegBitWidth <= 1024);

  const unsigned RegDWORDs = RegBitWidth / 32;
  const unsigned EltDWORDs = EltSize / 4;
  assert(RegSplitParts.size() + 1 >= EltDWORDs);

  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
  const unsigned NumParts = RegDWORDs / EltDWORDs;

  return makeArrayRef(Parts.data(), NumParts);
}

const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  Register Reg) const {
  return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && hasVGPRs(RC);
}

bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);

  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && hasAGPRs(RC);
}

bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase size of registers beyond dword, we would need to allocate
  // adjacent registers and constrain regalloc more than needed.

  // Always allow dword coalescing.
  if (SrcSize <= 32 || DstSize <= 32)
    return true;

  return NewSize <= DstSize || NewSize <= SrcSize;
}

unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
                                                       MF.getFunction());
  switch (RC->getID()) {
  default:
    return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
  case AMDGPU::VGPR_LO16RegClassID:
  case AMDGPU::VGPR_HI16RegClassID:
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::SGPR_LO16RegClassID:
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
  }
}

unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
      Idx == AMDGPU::RegisterPressureSets::AGPR_32)
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  llvm_unreachable("Unexpected register pressure set!");
}

const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
  static const int Empty[] = { -1 };

  if (RegPressureIgnoredUnits[RegUnit])
    return Empty;

  return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}

MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee saved register.
  return AMDGPU::SGPR30_SGPR31;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                         const RegisterBank &RB,
                                         const MachineRegisterInfo &MRI) const {
  switch (RB.getID()) {
  case AMDGPU::VGPRRegBankID:
    return getVGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::VCCRegBankID:
    assert(Size == 1);
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case AMDGPU::SGPRRegBankID:
    return getSGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::AGPRRegBankID:
    return getAGPRClassForBitWidth(std::max(32u, Size));
  default:
    llvm_unreachable("unknown register bank");
  }
}

const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                         const MachineRegisterInfo &MRI) const {
  const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
  if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);

  const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
  return getAllocatableClass(RC);
}

MCRegister SIRegisterInfo::getVCC() const {
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}

const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
  // VGPR tuples have an alignment requirement on gfx90a variants.
  return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
                                : &AMDGPU::VReg_64RegClass;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
  switch ((int)RCID) {
  case AMDGPU::SReg_1RegClassID:
    return getBoolRC();
  case AMDGPU::SReg_1_XEXECRegClassID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case -1:
    return nullptr;
  default:
    return AMDGPUGenRegisterInfo::getRegClass(RCID);
  }
}

// Find reaching register definition
MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
                                              MachineInstr &Use,
                                              MachineRegisterInfo &MRI,
                                              LiveIntervals *LIS) const {
  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;

  if (Reg.isVirtual()) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
                                  : MRI.getMaxLaneMaskForVReg(Reg);
    VNInfo *V = nullptr;
    if (LI.hasSubRanges()) {
      for (auto &S : LI.subranges()) {
        if ((S.LaneMask & SubLanes) == SubLanes) {
          V = S.getVNInfoAt(UseIdx);
          break;
        }
      }
    } else {
      V = LI.getVNInfoAt(UseIdx);
    }
    if (!V)
      return nullptr;
    DefIdx = V->def;
  } else {
    // Find last def.
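    // A physical register has no single live interval, so walk each of its
    // register units instead: every unit must have a value live at the use,
    // and we keep the def that is dominated by the ones already seen.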
    for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid();
         ++Units) {
      LiveRange &LR = LIS->getRegUnit(*Units);
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
        if (!DefIdx.isValid() ||
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
                          LIS->getInstructionFromIndex(V->def)))
          DefIdx = V->def;
      } else {
        return nullptr;
      }
    }
  }

  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);

  if (!Def || !MDT.dominates(Def, &Use))
    return nullptr;

  assert(Def->modifiesRegister(Reg, this));

  return Def;
}

MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
  assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32);

  for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
                                         AMDGPU::SReg_32RegClass,
                                         AMDGPU::AGPR_32RegClass } ) {
    if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
      return Super;
  }
  if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
                                            &AMDGPU::VGPR_32RegClass)) {
    return Super;
  }

  return AMDGPU::NoRegister;
}

bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
  switch (PhysReg) {
  case AMDGPU::SGPR_NULL:
  case AMDGPU::SRC_SHARED_BASE:
  case AMDGPU::SRC_PRIVATE_BASE:
  case AMDGPU::SRC_SHARED_LIMIT:
  case AMDGPU::SRC_PRIVATE_LIMIT:
    return true;
  default:
    return false;
  }
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 2);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}
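// As a rough worked example (the count is illustrative, not taken from any
// particular subtarget): if ST.getMaxNumSGPRs(MF) were 104, getAllSGPR32 would
// cover s0..s103, getAllSGPR64 the first 52 two-dword tuples, and
// getAllSGPR128 the first 26 four-dword tuples of the respective classes.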