1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI implementation of the TargetRegisterInfo class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIRegisterInfo.h" 15 #include "AMDGPU.h" 16 #include "AMDGPURegisterBankInfo.h" 17 #include "GCNSubtarget.h" 18 #include "MCTargetDesc/AMDGPUInstPrinter.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "llvm/CodeGen/LiveIntervals.h" 22 #include "llvm/CodeGen/MachineDominators.h" 23 #include "llvm/CodeGen/RegisterScavenging.h" 24 25 using namespace llvm; 26 27 #define GET_REGINFO_TARGET_DESC 28 #include "AMDGPUGenRegisterInfo.inc" 29 30 static cl::opt<bool> EnableSpillSGPRToVGPR( 31 "amdgpu-spill-sgpr-to-vgpr", 32 cl::desc("Enable spilling VGPRs to SGPRs"), 33 cl::ReallyHidden, 34 cl::init(true)); 35 36 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts; 37 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable; 38 39 // Map numbers of DWORDs to indexes in SubRegFromChannelTable. 40 // Valid indexes are shifted 1, such that a 0 mapping means unsupported. 41 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8, 42 // meaning index 7 in SubRegFromChannelTable. 43 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = { 44 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9}; 45 46 namespace llvm { 47 48 // A temporary struct to spill SGPRs. 49 // This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits 50 // just v_writelane and v_readlane. 51 // 52 // When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR 53 // is saved to scratch (or the other way around for loads). 54 // For this, a VGPR is required where the needed lanes can be clobbered. The 55 // RegScavenger can provide a VGPR where currently active lanes can be 56 // clobbered, but we still need to save inactive lanes. 57 // The high-level steps are: 58 // - Try to scavenge SGPR(s) to save exec 59 // - Try to scavenge VGPR 60 // - Save needed, all or inactive lanes of a TmpVGPR 61 // - Spill/Restore SGPRs using TmpVGPR 62 // - Restore TmpVGPR 63 // 64 // To save all lanes of TmpVGPR, exec needs to be saved and modified. If we 65 // cannot scavenge temporary SGPRs to save exec, we use the following code: 66 // buffer_store_dword TmpVGPR ; only if active lanes need to be saved 67 // s_not exec, exec 68 // buffer_store_dword TmpVGPR ; save inactive lanes 69 // s_not exec, exec 70 struct SGPRSpillBuilder { 71 struct PerVGPRData { 72 unsigned PerVGPR; 73 unsigned NumVGPRs; 74 int64_t VGPRLanes; 75 }; 76 77 // The SGPR to save 78 Register SuperReg; 79 MachineBasicBlock::iterator MI; 80 ArrayRef<int16_t> SplitParts; 81 unsigned NumSubRegs; 82 bool IsKill; 83 const DebugLoc &DL; 84 85 /* When spilling to stack */ 86 // The SGPRs are written into this VGPR, which is then written to scratch 87 // (or vice versa for loads). 88 Register TmpVGPR = AMDGPU::NoRegister; 89 // Temporary spill slot to save TmpVGPR to. 90 int TmpVGPRIndex = 0; 91 // If TmpVGPR is live before the spill or if it is scavenged. 
92 bool TmpVGPRLive = false; 93 // Scavenged SGPR to save EXEC. 94 Register SavedExecReg = AMDGPU::NoRegister; 95 // Stack index to write the SGPRs to. 96 int Index; 97 unsigned EltSize = 4; 98 99 RegScavenger *RS; 100 MachineBasicBlock *MBB; 101 MachineFunction &MF; 102 SIMachineFunctionInfo &MFI; 103 const SIInstrInfo &TII; 104 const SIRegisterInfo &TRI; 105 bool IsWave32; 106 Register ExecReg; 107 unsigned MovOpc; 108 unsigned NotOpc; 109 110 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, 111 bool IsWave32, MachineBasicBlock::iterator MI, int Index, 112 RegScavenger *RS) 113 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(), 114 MI->getOperand(0).isKill(), Index, RS) {} 115 116 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, 117 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, 118 bool IsKill, int Index, RegScavenger *RS) 119 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()), 120 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()), 121 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), 122 IsWave32(IsWave32) { 123 const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg); 124 SplitParts = TRI.getRegSplitParts(RC, EltSize); 125 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 126 127 if (IsWave32) { 128 ExecReg = AMDGPU::EXEC_LO; 129 MovOpc = AMDGPU::S_MOV_B32; 130 NotOpc = AMDGPU::S_NOT_B32; 131 } else { 132 ExecReg = AMDGPU::EXEC; 133 MovOpc = AMDGPU::S_MOV_B64; 134 NotOpc = AMDGPU::S_NOT_B64; 135 } 136 137 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 138 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && 139 SuperReg != AMDGPU::EXEC && "exec should never spill"); 140 } 141 142 PerVGPRData getPerVGPRData() { 143 PerVGPRData Data; 144 Data.PerVGPR = IsWave32 ? 32 : 64; 145 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR; 146 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL; 147 return Data; 148 } 149 150 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is 151 // free. 152 // Writes these instructions if an SGPR can be scavenged: 153 // s_mov_b64 s[6:7], exec ; Save exec 154 // s_mov_b64 exec, 3 ; Wanted lanemask 155 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot 156 // 157 // Writes these instructions if no SGPR can be scavenged: 158 // buffer_store_dword v0 ; Only if no free VGPR was found 159 // s_not_b64 exec, exec 160 // buffer_store_dword v0 ; Save inactive lanes 161 // ; exec stays inverted, it is flipped back in 162 // ; restore. 163 void prepare() { 164 // Scavenged temporary VGPR to use. It must be scavenged once for any number 165 // of spilled subregs. 166 // FIXME: The liveness analysis is limited and does not tell if a register 167 // is in use in lanes that are currently inactive. We can never be sure if 168 // a register as actually in use in another lane, so we need to save all 169 // used lanes of the chosen VGPR. 170 assert(RS && "Cannot spill SGPR to memory without RegScavenger"); 171 TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false); 172 173 // Reserve temporary stack slot 174 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI); 175 if (TmpVGPR) { 176 // Found a register that is dead in the currently active lanes, we only 177 // need to spill inactive lanes. 178 TmpVGPRLive = false; 179 } else { 180 // Pick v0 because it doesn't make a difference. 
181 TmpVGPR = AMDGPU::VGPR0; 182 TmpVGPRLive = true; 183 } 184 185 // Try to scavenge SGPRs to save exec 186 assert(!SavedExecReg && "Exec is already saved, refuse to save again"); 187 const TargetRegisterClass &RC = 188 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass; 189 RS->setRegUsed(SuperReg); 190 SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false); 191 192 int64_t VGPRLanes = getPerVGPRData().VGPRLanes; 193 194 if (SavedExecReg) { 195 RS->setRegUsed(SavedExecReg); 196 // Set exec to needed lanes 197 BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg); 198 auto I = 199 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes); 200 if (!TmpVGPRLive) 201 I.addReg(TmpVGPR, RegState::ImplicitDefine); 202 // Spill needed lanes 203 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); 204 } else { 205 // Spill active lanes 206 if (TmpVGPRLive) 207 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false, 208 /*IsKill*/ false); 209 // Spill inactive lanes 210 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 211 if (!TmpVGPRLive) 212 I.addReg(TmpVGPR, RegState::ImplicitDefine); 213 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); 214 } 215 } 216 217 // Writes these instructions if an SGPR can be scavenged: 218 // buffer_load_dword v1 ; Write scavenged VGPR to emergency slot 219 // s_waitcnt vmcnt(0) ; If a free VGPR was found 220 // s_mov_b64 exec, s[6:7] ; Save exec 221 // 222 // Writes these instructions if no SGPR can be scavenged: 223 // buffer_load_dword v0 ; Restore inactive lanes 224 // s_waitcnt vmcnt(0) ; If a free VGPR was found 225 // s_not_b64 exec, exec 226 // buffer_load_dword v0 ; Only if no free VGPR was found 227 void restore() { 228 if (SavedExecReg) { 229 // Restore used lanes 230 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, 231 /*IsKill*/ false); 232 // Restore exec 233 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg) 234 .addReg(SavedExecReg, RegState::Kill); 235 // Add an implicit use of the load so it is not dead. 236 // FIXME This inserts an unnecessary waitcnt 237 if (!TmpVGPRLive) { 238 I.addReg(TmpVGPR, RegState::ImplicitKill); 239 } 240 } else { 241 // Restore inactive lanes 242 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, 243 /*IsKill*/ false); 244 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 245 if (!TmpVGPRLive) { 246 I.addReg(TmpVGPR, RegState::ImplicitKill); 247 } 248 // Restore active lanes 249 if (TmpVGPRLive) 250 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); 251 } 252 } 253 254 // Write TmpVGPR to memory or read TmpVGPR from memory. 
255 // Either using a single buffer_load/store if exec is set to the needed mask 256 // or using 257 // buffer_load 258 // s_not exec, exec 259 // buffer_load 260 // s_not exec, exec 261 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) { 262 if (SavedExecReg) { 263 // Spill needed lanes 264 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 265 } else { 266 // Spill active lanes 267 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, 268 /*IsKill*/ false); 269 // Spill inactive lanes 270 BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 271 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 272 BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 273 } 274 } 275 276 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) { 277 assert(MBB->getParent() == &MF); 278 MI = NewMI; 279 MBB = NewMBB; 280 } 281 }; 282 283 } // namespace llvm 284 285 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) 286 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST), 287 SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { 288 289 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && 290 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) && 291 (getSubRegIndexLaneMask(AMDGPU::lo16) | 292 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() == 293 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() && 294 "getNumCoveredRegs() will not work with generated subreg masks!"); 295 296 RegPressureIgnoredUnits.resize(getNumRegUnits()); 297 RegPressureIgnoredUnits.set( 298 *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this)); 299 for (auto Reg : AMDGPU::VGPR_HI16RegClass) 300 RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this)); 301 302 // HACK: Until this is fully tablegen'd. 303 static llvm::once_flag InitializeRegSplitPartsFlag; 304 305 static auto InitializeRegSplitPartsOnce = [this]() { 306 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) { 307 unsigned Size = getSubRegIdxSize(Idx); 308 if (Size & 31) 309 continue; 310 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1]; 311 unsigned Pos = getSubRegIdxOffset(Idx); 312 if (Pos % Size) 313 continue; 314 Pos /= Size; 315 if (Vec.empty()) { 316 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits. 
317 Vec.resize(MaxNumParts); 318 } 319 Vec[Pos] = Idx; 320 } 321 }; 322 323 static llvm::once_flag InitializeSubRegFromChannelTableFlag; 324 325 static auto InitializeSubRegFromChannelTableOnce = [this]() { 326 for (auto &Row : SubRegFromChannelTable) 327 Row.fill(AMDGPU::NoSubRegister); 328 for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { 329 unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32; 330 unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32; 331 assert(Width < SubRegFromChannelTableWidthMap.size()); 332 Width = SubRegFromChannelTableWidthMap[Width]; 333 if (Width == 0) 334 continue; 335 unsigned TableIdx = Width - 1; 336 assert(TableIdx < SubRegFromChannelTable.size()); 337 assert(Offset < SubRegFromChannelTable[TableIdx].size()); 338 SubRegFromChannelTable[TableIdx][Offset] = Idx; 339 } 340 }; 341 342 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce); 343 llvm::call_once(InitializeSubRegFromChannelTableFlag, 344 InitializeSubRegFromChannelTableOnce); 345 } 346 347 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, 348 MCRegister Reg) const { 349 MCRegAliasIterator R(Reg, this, true); 350 351 for (; R.isValid(); ++R) 352 Reserved.set(*R); 353 } 354 355 // Forced to be here by one .inc 356 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( 357 const MachineFunction *MF) const { 358 CallingConv::ID CC = MF->getFunction().getCallingConv(); 359 switch (CC) { 360 case CallingConv::C: 361 case CallingConv::Fast: 362 case CallingConv::Cold: 363 return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts() 364 ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList 365 : CSR_AMDGPU_HighRegs_SaveList; 366 case CallingConv::AMDGPU_Gfx: 367 return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts() 368 ? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList 369 : CSR_AMDGPU_SI_Gfx_SaveList; 370 default: { 371 // Dummy to not crash RegisterClassInfo. 372 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; 373 return &NoCalleeSavedReg; 374 } 375 } 376 } 377 378 const MCPhysReg * 379 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { 380 return nullptr; 381 } 382 383 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, 384 CallingConv::ID CC) const { 385 switch (CC) { 386 case CallingConv::C: 387 case CallingConv::Fast: 388 case CallingConv::Cold: 389 return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts() 390 ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask 391 : CSR_AMDGPU_HighRegs_RegMask; 392 case CallingConv::AMDGPU_Gfx: 393 return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts() 394 ? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask 395 : CSR_AMDGPU_SI_Gfx_RegMask; 396 default: 397 return nullptr; 398 } 399 } 400 401 const uint32_t *SIRegisterInfo::getNoPreservedMask() const { 402 return CSR_AMDGPU_NoRegs_RegMask; 403 } 404 405 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { 406 const SIFrameLowering *TFI = 407 MF.getSubtarget<GCNSubtarget>().getFrameLowering(); 408 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 409 // During ISel lowering we always reserve the stack pointer in entry 410 // functions, but never actually want to reference it when accessing our own 411 // frame. If we need a frame pointer we use it, but otherwise we can just use 412 // an immediate "0" which we represent by returning NoRegister. 413 if (FuncInfo->isEntryFunction()) { 414 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); 415 } 416 return TFI->hasFP(MF) ? 
FuncInfo->getFrameOffsetReg() 417 : FuncInfo->getStackPtrOffsetReg(); 418 } 419 420 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { 421 // When we need stack realignment, we can't reference off of the 422 // stack pointer, so we reserve a base pointer. 423 const MachineFrameInfo &MFI = MF.getFrameInfo(); 424 return MFI.getNumFixedObjects() && shouldRealignStack(MF); 425 } 426 427 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } 428 429 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { 430 return CSR_AMDGPU_AllVGPRs_RegMask; 431 } 432 433 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const { 434 return CSR_AMDGPU_AllAGPRs_RegMask; 435 } 436 437 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const { 438 return CSR_AMDGPU_AllVectorRegs_RegMask; 439 } 440 441 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { 442 return CSR_AMDGPU_AllAllocatableSRegs_RegMask; 443 } 444 445 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, 446 unsigned NumRegs) { 447 assert(NumRegs < SubRegFromChannelTableWidthMap.size()); 448 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs]; 449 assert(NumRegIndex && "Not implemented"); 450 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size()); 451 return SubRegFromChannelTable[NumRegIndex - 1][Channel]; 452 } 453 454 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( 455 const MachineFunction &MF) const { 456 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; 457 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); 458 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); 459 } 460 461 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { 462 BitVector Reserved(getNumRegs()); 463 Reserved.set(AMDGPU::MODE); 464 465 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but 466 // this seems likely to result in bugs, so I'm marking them as reserved. 467 reserveRegisterTuples(Reserved, AMDGPU::EXEC); 468 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); 469 470 // M0 has to be reserved so that llvm accepts it as a live-in into a block. 471 reserveRegisterTuples(Reserved, AMDGPU::M0); 472 473 // Reserve src_vccz, src_execz, src_scc. 474 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); 475 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); 476 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); 477 478 // Reserve the memory aperture registers. 479 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); 480 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); 481 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); 482 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); 483 484 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. 485 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); 486 487 // Reserve xnack_mask registers - support is not implemented in Codegen. 488 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); 489 490 // Reserve lds_direct register - support is not implemented in Codegen. 491 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); 492 493 // Reserve Trap Handler registers - support is not implemented in Codegen. 
494 reserveRegisterTuples(Reserved, AMDGPU::TBA); 495 reserveRegisterTuples(Reserved, AMDGPU::TMA); 496 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); 497 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); 498 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); 499 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); 500 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); 501 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); 502 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); 503 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); 504 505 // Reserve null register - it shall never be allocated 506 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); 507 508 // Disallow vcc_hi allocation in wave32. It may be allocated but most likely 509 // will result in bugs. 510 if (isWave32) { 511 Reserved.set(AMDGPU::VCC); 512 Reserved.set(AMDGPU::VCC_HI); 513 } 514 515 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 516 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 517 for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { 518 unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); 519 reserveRegisterTuples(Reserved, Reg); 520 } 521 522 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 523 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); 524 unsigned MaxNumAGPRs = MaxNumVGPRs; 525 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 526 527 if (ST.hasGFX90AInsts()) { 528 // In an entry function without calls and AGPRs used it is possible to use 529 // the whole register budget for VGPRs. 530 531 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and 532 // split register file accordingly. 533 if (MFI->usesAGPRs(MF)) { 534 MaxNumVGPRs /= 2; 535 MaxNumAGPRs = MaxNumVGPRs; 536 } else { 537 if (MaxNumVGPRs > TotalNumVGPRs) { 538 MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; 539 MaxNumVGPRs = TotalNumVGPRs; 540 } else 541 MaxNumAGPRs = 0; 542 } 543 } 544 545 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { 546 unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); 547 reserveRegisterTuples(Reserved, Reg); 548 } 549 550 for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) { 551 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 552 reserveRegisterTuples(Reserved, Reg); 553 } 554 555 for (auto Reg : AMDGPU::SReg_32RegClass) { 556 Reserved.set(getSubReg(Reg, AMDGPU::hi16)); 557 Register Low = getSubReg(Reg, AMDGPU::lo16); 558 // This is to prevent BB vcc liveness errors. 559 if (!AMDGPU::SGPR_LO16RegClass.contains(Low)) 560 Reserved.set(Low); 561 } 562 563 for (auto Reg : AMDGPU::AGPR_32RegClass) { 564 Reserved.set(getSubReg(Reg, AMDGPU::hi16)); 565 } 566 567 // Reserve all the rest AGPRs if there are no instructions to use it. 568 if (!ST.hasMAIInsts()) { 569 for (unsigned i = 0; i < MaxNumVGPRs; ++i) { 570 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 571 reserveRegisterTuples(Reserved, Reg); 572 } 573 } 574 575 Register ScratchRSrcReg = MFI->getScratchRSrcReg(); 576 if (ScratchRSrcReg != AMDGPU::NoRegister) { 577 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need 578 // to spill. 579 // TODO: May need to reserve a VGPR if doing LDS spilling. 580 reserveRegisterTuples(Reserved, ScratchRSrcReg); 581 } 582 583 // We have to assume the SP is needed in case there are calls in the function, 584 // which is detected after the function is lowered. If we aren't really going 585 // to need SP, don't bother reserving it. 
586 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); 587 588 if (StackPtrReg) { 589 reserveRegisterTuples(Reserved, StackPtrReg); 590 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); 591 } 592 593 MCRegister FrameReg = MFI->getFrameOffsetReg(); 594 if (FrameReg) { 595 reserveRegisterTuples(Reserved, FrameReg); 596 assert(!isSubRegister(ScratchRSrcReg, FrameReg)); 597 } 598 599 if (hasBasePointer(MF)) { 600 MCRegister BasePtrReg = getBaseRegister(); 601 reserveRegisterTuples(Reserved, BasePtrReg); 602 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); 603 } 604 605 for (auto Reg : MFI->WWMReservedRegs) { 606 reserveRegisterTuples(Reserved, Reg.first); 607 } 608 609 // Reserve VGPRs used for SGPR spilling. 610 // Note we treat freezeReservedRegs unusually because we run register 611 // allocation in two phases. It's OK to re-freeze with new registers for the 612 // second run. 613 #if 0 614 for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) { 615 for (auto &SpilledVGPR : SpilledFI.second) 616 reserveRegisterTuples(Reserved, SpilledVGPR.VGPR); 617 } 618 #endif 619 620 // FIXME: Stop using reserved registers for this. 621 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) 622 reserveRegisterTuples(Reserved, Reg); 623 624 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) 625 reserveRegisterTuples(Reserved, Reg); 626 627 for (auto SSpill : MFI->getSGPRSpillVGPRs()) 628 reserveRegisterTuples(Reserved, SSpill.VGPR); 629 630 return Reserved; 631 } 632 633 bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const { 634 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 635 // On entry, the base address is 0, so it can't possibly need any more 636 // alignment. 637 638 // FIXME: Should be able to specify the entry frame alignment per calling 639 // convention instead. 640 if (Info->isEntryFunction()) 641 return false; 642 643 return TargetRegisterInfo::shouldRealignStack(MF); 644 } 645 646 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { 647 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); 648 if (Info->isEntryFunction()) { 649 const MachineFrameInfo &MFI = Fn.getFrameInfo(); 650 return MFI.hasStackObjects() || MFI.hasCalls(); 651 } 652 653 // May need scavenger for dealing with callee saved registers. 654 return true; 655 } 656 657 bool SIRegisterInfo::requiresFrameIndexScavenging( 658 const MachineFunction &MF) const { 659 // Do not use frame virtual registers. They used to be used for SGPRs, but 660 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the 661 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a 662 // spill. 663 return false; 664 } 665 666 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( 667 const MachineFunction &MF) const { 668 const MachineFrameInfo &MFI = MF.getFrameInfo(); 669 return MFI.hasStackObjects(); 670 } 671 672 bool SIRegisterInfo::requiresVirtualBaseRegisters( 673 const MachineFunction &) const { 674 // There are no special dedicated stack or frame pointers. 
675 return true; 676 } 677 678 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const { 679 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI)); 680 681 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 682 AMDGPU::OpName::offset); 683 return MI->getOperand(OffIdx).getImm(); 684 } 685 686 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, 687 int Idx) const { 688 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 689 return 0; 690 691 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 692 AMDGPU::OpName::vaddr) || 693 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 694 AMDGPU::OpName::saddr))) && 695 "Should never see frame index on non-address operand"); 696 697 return getScratchInstrOffset(MI); 698 } 699 700 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { 701 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 702 return false; 703 704 int64_t FullOffset = Offset + getScratchInstrOffset(MI); 705 706 if (SIInstrInfo::isMUBUF(*MI)) 707 return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset); 708 709 const SIInstrInfo *TII = ST.getInstrInfo(); 710 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, 711 SIInstrFlags::FlatScratch); 712 } 713 714 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, 715 int FrameIdx, 716 int64_t Offset) const { 717 MachineBasicBlock::iterator Ins = MBB->begin(); 718 DebugLoc DL; // Defaults to "unknown" 719 720 if (Ins != MBB->end()) 721 DL = Ins->getDebugLoc(); 722 723 MachineFunction *MF = MBB->getParent(); 724 const SIInstrInfo *TII = ST.getInstrInfo(); 725 MachineRegisterInfo &MRI = MF->getRegInfo(); 726 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32 727 : AMDGPU::V_MOV_B32_e32; 728 729 Register BaseReg = MRI.createVirtualRegister( 730 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass 731 : &AMDGPU::VGPR_32RegClass); 732 733 if (Offset == 0) { 734 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg) 735 .addFrameIndex(FrameIdx); 736 return BaseReg; 737 } 738 739 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 740 741 Register FIReg = MRI.createVirtualRegister( 742 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass 743 : &AMDGPU::VGPR_32RegClass); 744 745 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 746 .addImm(Offset); 747 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg) 748 .addFrameIndex(FrameIdx); 749 750 if (ST.enableFlatScratch() ) { 751 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) 752 .addReg(OffsetReg, RegState::Kill) 753 .addReg(FIReg); 754 return BaseReg; 755 } 756 757 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) 758 .addReg(OffsetReg, RegState::Kill) 759 .addReg(FIReg) 760 .addImm(0); // clamp bit 761 762 return BaseReg; 763 } 764 765 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, 766 int64_t Offset) const { 767 const SIInstrInfo *TII = ST.getInstrInfo(); 768 bool IsFlat = TII->isFLATScratch(MI); 769 770 #ifndef NDEBUG 771 // FIXME: Is it possible to be storing a frame index to itself? 772 bool SeenFI = false; 773 for (const MachineOperand &MO: MI.operands()) { 774 if (MO.isFI()) { 775 if (SeenFI) 776 llvm_unreachable("should not see multiple frame indices"); 777 778 SeenFI = true; 779 } 780 } 781 #endif 782 783 MachineOperand *FIOp = 784 TII->getNamedOperand(MI, IsFlat ? 
AMDGPU::OpName::saddr 785 : AMDGPU::OpName::vaddr); 786 787 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); 788 int64_t NewOffset = OffsetOp->getImm() + Offset; 789 790 assert(FIOp && FIOp->isFI() && "frame index must be address operand"); 791 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI)); 792 793 if (IsFlat) { 794 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 795 SIInstrFlags::FlatScratch) && 796 "offset should be legal"); 797 FIOp->ChangeToRegister(BaseReg, false); 798 OffsetOp->setImm(NewOffset); 799 return; 800 } 801 802 #ifndef NDEBUG 803 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset); 804 assert(SOffset->isImm() && SOffset->getImm() == 0); 805 #endif 806 807 assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && 808 "offset should be legal"); 809 810 FIOp->ChangeToRegister(BaseReg, false); 811 OffsetOp->setImm(NewOffset); 812 } 813 814 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, 815 Register BaseReg, 816 int64_t Offset) const { 817 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 818 return false; 819 820 int64_t NewOffset = Offset + getScratchInstrOffset(MI); 821 822 if (SIInstrInfo::isMUBUF(*MI)) 823 return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset); 824 825 const SIInstrInfo *TII = ST.getInstrInfo(); 826 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 827 SIInstrFlags::FlatScratch); 828 } 829 830 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( 831 const MachineFunction &MF, unsigned Kind) const { 832 // This is inaccurate. It depends on the instruction and address space. The 833 // only place where we should hit this is for dealing with frame indexes / 834 // private accesses, so this is correct in that case. 
835 return &AMDGPU::VGPR_32RegClass; 836 } 837 838 const TargetRegisterClass * 839 SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { 840 if (isAGPRClass(RC) && !ST.hasGFX90AInsts()) 841 return getEquivalentVGPRClass(RC); 842 843 return RC; 844 } 845 846 static unsigned getNumSubRegsForSpillOp(unsigned Op) { 847 848 switch (Op) { 849 case AMDGPU::SI_SPILL_S1024_SAVE: 850 case AMDGPU::SI_SPILL_S1024_RESTORE: 851 case AMDGPU::SI_SPILL_V1024_SAVE: 852 case AMDGPU::SI_SPILL_V1024_RESTORE: 853 case AMDGPU::SI_SPILL_A1024_SAVE: 854 case AMDGPU::SI_SPILL_A1024_RESTORE: 855 return 32; 856 case AMDGPU::SI_SPILL_S512_SAVE: 857 case AMDGPU::SI_SPILL_S512_RESTORE: 858 case AMDGPU::SI_SPILL_V512_SAVE: 859 case AMDGPU::SI_SPILL_V512_RESTORE: 860 case AMDGPU::SI_SPILL_A512_SAVE: 861 case AMDGPU::SI_SPILL_A512_RESTORE: 862 return 16; 863 case AMDGPU::SI_SPILL_S256_SAVE: 864 case AMDGPU::SI_SPILL_S256_RESTORE: 865 case AMDGPU::SI_SPILL_V256_SAVE: 866 case AMDGPU::SI_SPILL_V256_RESTORE: 867 case AMDGPU::SI_SPILL_A256_SAVE: 868 case AMDGPU::SI_SPILL_A256_RESTORE: 869 return 8; 870 case AMDGPU::SI_SPILL_S224_SAVE: 871 case AMDGPU::SI_SPILL_S224_RESTORE: 872 case AMDGPU::SI_SPILL_V224_SAVE: 873 case AMDGPU::SI_SPILL_V224_RESTORE: 874 case AMDGPU::SI_SPILL_A224_SAVE: 875 case AMDGPU::SI_SPILL_A224_RESTORE: 876 return 7; 877 case AMDGPU::SI_SPILL_S192_SAVE: 878 case AMDGPU::SI_SPILL_S192_RESTORE: 879 case AMDGPU::SI_SPILL_V192_SAVE: 880 case AMDGPU::SI_SPILL_V192_RESTORE: 881 case AMDGPU::SI_SPILL_A192_SAVE: 882 case AMDGPU::SI_SPILL_A192_RESTORE: 883 return 6; 884 case AMDGPU::SI_SPILL_S160_SAVE: 885 case AMDGPU::SI_SPILL_S160_RESTORE: 886 case AMDGPU::SI_SPILL_V160_SAVE: 887 case AMDGPU::SI_SPILL_V160_RESTORE: 888 case AMDGPU::SI_SPILL_A160_SAVE: 889 case AMDGPU::SI_SPILL_A160_RESTORE: 890 return 5; 891 case AMDGPU::SI_SPILL_S128_SAVE: 892 case AMDGPU::SI_SPILL_S128_RESTORE: 893 case AMDGPU::SI_SPILL_V128_SAVE: 894 case AMDGPU::SI_SPILL_V128_RESTORE: 895 case AMDGPU::SI_SPILL_A128_SAVE: 896 case AMDGPU::SI_SPILL_A128_RESTORE: 897 return 4; 898 case AMDGPU::SI_SPILL_S96_SAVE: 899 case AMDGPU::SI_SPILL_S96_RESTORE: 900 case AMDGPU::SI_SPILL_V96_SAVE: 901 case AMDGPU::SI_SPILL_V96_RESTORE: 902 case AMDGPU::SI_SPILL_A96_SAVE: 903 case AMDGPU::SI_SPILL_A96_RESTORE: 904 return 3; 905 case AMDGPU::SI_SPILL_S64_SAVE: 906 case AMDGPU::SI_SPILL_S64_RESTORE: 907 case AMDGPU::SI_SPILL_V64_SAVE: 908 case AMDGPU::SI_SPILL_V64_RESTORE: 909 case AMDGPU::SI_SPILL_A64_SAVE: 910 case AMDGPU::SI_SPILL_A64_RESTORE: 911 return 2; 912 case AMDGPU::SI_SPILL_S32_SAVE: 913 case AMDGPU::SI_SPILL_S32_RESTORE: 914 case AMDGPU::SI_SPILL_V32_SAVE: 915 case AMDGPU::SI_SPILL_V32_RESTORE: 916 case AMDGPU::SI_SPILL_A32_SAVE: 917 case AMDGPU::SI_SPILL_A32_RESTORE: 918 return 1; 919 default: llvm_unreachable("Invalid spill opcode"); 920 } 921 } 922 923 static int getOffsetMUBUFStore(unsigned Opc) { 924 switch (Opc) { 925 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 926 return AMDGPU::BUFFER_STORE_DWORD_OFFSET; 927 case AMDGPU::BUFFER_STORE_BYTE_OFFEN: 928 return AMDGPU::BUFFER_STORE_BYTE_OFFSET; 929 case AMDGPU::BUFFER_STORE_SHORT_OFFEN: 930 return AMDGPU::BUFFER_STORE_SHORT_OFFSET; 931 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: 932 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; 933 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: 934 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; 935 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: 936 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; 937 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: 938 return 
AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; 939 default: 940 return -1; 941 } 942 } 943 944 static int getOffsetMUBUFLoad(unsigned Opc) { 945 switch (Opc) { 946 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 947 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 948 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: 949 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; 950 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: 951 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; 952 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: 953 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; 954 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: 955 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; 956 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: 957 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; 958 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: 959 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; 960 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: 961 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; 962 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: 963 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; 964 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: 965 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; 966 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: 967 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; 968 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: 969 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; 970 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: 971 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; 972 default: 973 return -1; 974 } 975 } 976 977 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, 978 MachineBasicBlock &MBB, 979 MachineBasicBlock::iterator MI, 980 int Index, unsigned Lane, 981 unsigned ValueReg, bool IsKill) { 982 MachineFunction *MF = MBB.getParent(); 983 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 984 const SIInstrInfo *TII = ST.getInstrInfo(); 985 986 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); 987 988 if (Reg == AMDGPU::NoRegister) 989 return MachineInstrBuilder(); 990 991 bool IsStore = MI->mayStore(); 992 MachineRegisterInfo &MRI = MF->getRegInfo(); 993 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 994 995 unsigned Dst = IsStore ? Reg : ValueReg; 996 unsigned Src = IsStore ? ValueReg : Reg; 997 unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 998 : AMDGPU::V_ACCVGPR_READ_B32_e64; 999 1000 auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) 1001 .addReg(Src, getKillRegState(IsKill)); 1002 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1003 return MIB; 1004 } 1005 1006 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not 1007 // need to handle the case where an SGPR may need to be spilled while spilling. 1008 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, 1009 MachineFrameInfo &MFI, 1010 MachineBasicBlock::iterator MI, 1011 int Index, 1012 int64_t Offset) { 1013 const SIInstrInfo *TII = ST.getInstrInfo(); 1014 MachineBasicBlock *MBB = MI->getParent(); 1015 const DebugLoc &DL = MI->getDebugLoc(); 1016 bool IsStore = MI->mayStore(); 1017 1018 unsigned Opc = MI->getOpcode(); 1019 int LoadStoreOp = IsStore ? 
1020 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); 1021 if (LoadStoreOp == -1) 1022 return false; 1023 1024 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); 1025 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr()) 1026 return true; 1027 1028 MachineInstrBuilder NewMI = 1029 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) 1030 .add(*Reg) 1031 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) 1032 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) 1033 .addImm(Offset) 1034 .addImm(0) // cpol 1035 .addImm(0) // tfe 1036 .addImm(0) // swz 1037 .cloneMemRefs(*MI); 1038 1039 const MachineOperand *VDataIn = TII->getNamedOperand(*MI, 1040 AMDGPU::OpName::vdata_in); 1041 if (VDataIn) 1042 NewMI.add(*VDataIn); 1043 return true; 1044 } 1045 1046 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, 1047 unsigned LoadStoreOp, 1048 unsigned EltSize) { 1049 bool IsStore = TII->get(LoadStoreOp).mayStore(); 1050 bool UseST = 1051 AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 && 1052 AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0; 1053 1054 switch (EltSize) { 1055 case 4: 1056 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1057 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR; 1058 break; 1059 case 8: 1060 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR 1061 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR; 1062 break; 1063 case 12: 1064 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR 1065 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR; 1066 break; 1067 case 16: 1068 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR 1069 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR; 1070 break; 1071 default: 1072 llvm_unreachable("Unexpected spill load/store size!"); 1073 } 1074 1075 if (UseST) 1076 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1077 1078 return LoadStoreOp; 1079 } 1080 1081 void SIRegisterInfo::buildSpillLoadStore( 1082 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, 1083 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill, 1084 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO, 1085 RegScavenger *RS, LivePhysRegs *LiveRegs) const { 1086 assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both"); 1087 1088 MachineFunction *MF = MBB.getParent(); 1089 const SIInstrInfo *TII = ST.getInstrInfo(); 1090 const MachineFrameInfo &MFI = MF->getFrameInfo(); 1091 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); 1092 1093 const MCInstrDesc *Desc = &TII->get(LoadStoreOp); 1094 bool IsStore = Desc->mayStore(); 1095 bool IsFlat = TII->isFLATScratch(LoadStoreOp); 1096 1097 bool Scavenged = false; 1098 MCRegister SOffset = ScratchOffsetReg; 1099 1100 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); 1101 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores. 1102 const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC); 1103 const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8; 1104 1105 // Always use 4 byte operations for AGPRs because we need to scavenge 1106 // a temporary VGPR. 1107 unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u; 1108 unsigned NumSubRegs = RegWidth / EltSize; 1109 unsigned Size = NumSubRegs * EltSize; 1110 unsigned RemSize = RegWidth - Size; 1111 unsigned NumRemSubRegs = RemSize ? 
1 : 0; 1112 int64_t Offset = InstOffset + MFI.getObjectOffset(Index); 1113 int64_t MaxOffset = Offset + Size + RemSize - EltSize; 1114 int64_t ScratchOffsetRegDelta = 0; 1115 1116 if (IsFlat && EltSize > 4) { 1117 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 1118 Desc = &TII->get(LoadStoreOp); 1119 } 1120 1121 Align Alignment = MFI.getObjectAlign(Index); 1122 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); 1123 1124 assert((IsFlat || ((Offset % EltSize) == 0)) && 1125 "unexpected VGPR spill offset"); 1126 1127 bool IsOffsetLegal = 1128 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, 1129 SIInstrFlags::FlatScratch) 1130 : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset); 1131 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { 1132 SOffset = MCRegister(); 1133 1134 // We currently only support spilling VGPRs to EltSize boundaries, meaning 1135 // we can simplify the adjustment of Offset here to just scale with 1136 // WavefrontSize. 1137 if (!IsFlat) 1138 Offset *= ST.getWavefrontSize(); 1139 1140 // We don't have access to the register scavenger if this function is called 1141 // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case. 1142 if (RS) { 1143 SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); 1144 } else if (LiveRegs) { 1145 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) { 1146 if (LiveRegs->available(MF->getRegInfo(), Reg)) { 1147 SOffset = Reg; 1148 break; 1149 } 1150 } 1151 } 1152 1153 if (!SOffset) { 1154 // There are no free SGPRs, and since we are in the process of spilling 1155 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true 1156 // on SI/CI and on VI it is true until we implement spilling using scalar 1157 // stores), we have no way to free up an SGPR. Our solution here is to 1158 // add the offset directly to the ScratchOffset or StackPtrOffset 1159 // register, and then subtract the offset after the spill to return the 1160 // register to it's original value. 1161 if (!ScratchOffsetReg) 1162 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg(); 1163 SOffset = ScratchOffsetReg; 1164 ScratchOffsetRegDelta = Offset; 1165 } else { 1166 Scavenged = true; 1167 } 1168 1169 if (!SOffset) 1170 report_fatal_error("could not scavenge SGPR to spill in entry function"); 1171 1172 if (ScratchOffsetReg == AMDGPU::NoRegister) { 1173 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); 1174 } else { 1175 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1176 .addReg(ScratchOffsetReg) 1177 .addImm(Offset); 1178 } 1179 1180 Offset = 0; 1181 } 1182 1183 if (IsFlat && SOffset == AMDGPU::NoRegister) { 1184 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 1185 && "Unexpected vaddr for flat scratch with a FI operand"); 1186 1187 assert(ST.hasFlatScratchSTMode()); 1188 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1189 Desc = &TII->get(LoadStoreOp); 1190 } 1191 1192 Register TmpReg; 1193 1194 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e; 1195 ++i, RegOffset += EltSize) { 1196 if (i == NumSubRegs) { 1197 EltSize = RemSize; 1198 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 1199 } 1200 Desc = &TII->get(LoadStoreOp); 1201 1202 unsigned NumRegs = EltSize / 4; 1203 Register SubReg = e == 1 1204 ? 
ValueReg 1205 : Register(getSubReg(ValueReg, 1206 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1207 1208 unsigned SOffsetRegState = 0; 1209 unsigned SrcDstRegState = getDefRegState(!IsStore); 1210 if (i + 1 == e) { 1211 SOffsetRegState |= getKillRegState(Scavenged); 1212 // The last implicit use carries the "Kill" flag. 1213 SrcDstRegState |= getKillRegState(IsKill); 1214 } 1215 1216 // Make sure the whole register is defined if there are undef components by 1217 // adding an implicit def of the super-reg on the first instruction. 1218 bool NeedSuperRegDef = e > 1 && IsStore && i == 0; 1219 bool NeedSuperRegImpOperand = e > 1; 1220 1221 // Remaining element size to spill into memory after some parts of it 1222 // spilled into either AGPRs or VGPRs. 1223 unsigned RemEltSize = EltSize; 1224 1225 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order, 1226 // starting from the last lane. In case if a register cannot be completely 1227 // spilled into another register that will ensure its alignment does not 1228 // change. For targets with VGPR alignment requirement this is important 1229 // in case of flat scratch usage as we might get a scratch_load or 1230 // scratch_store of an unaligned register otherwise. 1231 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS, 1232 LaneE = RegOffset / 4; 1233 Lane >= LaneE; --Lane) { 1234 bool IsSubReg = e > 1 || EltSize > 4; 1235 Register Sub = IsSubReg 1236 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane))) 1237 : ValueReg; 1238 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill); 1239 if (!MIB.getInstr()) 1240 break; 1241 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && !i)) { 1242 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1243 NeedSuperRegDef = false; 1244 } 1245 if (IsSubReg || NeedSuperRegImpOperand) { 1246 NeedSuperRegImpOperand = true; 1247 unsigned State = SrcDstRegState; 1248 if (Lane != LaneE) 1249 State &= ~RegState::Kill; 1250 MIB.addReg(ValueReg, RegState::Implicit | State); 1251 } 1252 RemEltSize -= 4; 1253 } 1254 1255 if (!RemEltSize) // Fully spilled into AGPRs. 
1256 continue; 1257 1258 if (RemEltSize != EltSize) { // Partially spilled to AGPRs 1259 assert(IsFlat && EltSize > 4); 1260 1261 unsigned NumRegs = RemEltSize / 4; 1262 SubReg = Register(getSubReg(ValueReg, 1263 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1264 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize); 1265 Desc = &TII->get(Opc); 1266 } 1267 1268 unsigned FinalReg = SubReg; 1269 1270 if (IsAGPR) { 1271 assert(EltSize == 4); 1272 1273 if (!TmpReg) { 1274 assert(RS && "Needs to have RegScavenger to spill an AGPR!"); 1275 // FIXME: change to scavengeRegisterBackwards() 1276 TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1277 RS->setRegUsed(TmpReg); 1278 } 1279 if (IsStore) { 1280 auto AccRead = BuildMI(MBB, MI, DL, 1281 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg) 1282 .addReg(SubReg, getKillRegState(IsKill)); 1283 if (NeedSuperRegDef) 1284 AccRead.addReg(ValueReg, RegState::ImplicitDefine); 1285 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1286 } 1287 SubReg = TmpReg; 1288 } 1289 1290 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); 1291 MachineMemOperand *NewMMO = 1292 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, 1293 commonAlignment(Alignment, RegOffset)); 1294 1295 auto MIB = 1296 BuildMI(MBB, MI, DL, *Desc) 1297 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)); 1298 if (!IsFlat) 1299 MIB.addReg(FuncInfo->getScratchRSrcReg()); 1300 1301 if (SOffset == AMDGPU::NoRegister) { 1302 if (!IsFlat) 1303 MIB.addImm(0); 1304 } else { 1305 MIB.addReg(SOffset, SOffsetRegState); 1306 } 1307 MIB.addImm(Offset + RegOffset) 1308 .addImm(0); // cpol 1309 if (!IsFlat) 1310 MIB.addImm(0) // tfe 1311 .addImm(0); // swz 1312 MIB.addMemOperand(NewMMO); 1313 1314 if (!IsAGPR && NeedSuperRegDef) 1315 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1316 1317 if (!IsStore && TmpReg != AMDGPU::NoRegister) { 1318 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), 1319 FinalReg) 1320 .addReg(TmpReg, RegState::Kill); 1321 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1322 } 1323 1324 if (NeedSuperRegImpOperand) 1325 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 1326 } 1327 1328 if (ScratchOffsetRegDelta != 0) { 1329 // Subtract the offset we added to the ScratchOffset register. 1330 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1331 .addReg(SOffset) 1332 .addImm(-ScratchOffsetRegDelta); 1333 } 1334 } 1335 1336 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, 1337 int Offset, bool IsLoad, 1338 bool IsKill) const { 1339 // Load/store VGPR 1340 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); 1341 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); 1342 1343 Register FrameReg = 1344 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF) 1345 ? getBaseRegister() 1346 : getFrameRegister(SB.MF); 1347 1348 Align Alignment = FrameInfo.getObjectAlign(Index); 1349 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index); 1350 MachineMemOperand *MMO = SB.MF.getMachineMemOperand( 1351 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, 1352 SB.EltSize, Alignment); 1353 1354 if (IsLoad) { 1355 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1356 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1357 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, 1358 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1359 } else { 1360 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1361 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1362 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, 1363 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1364 // This only ever adds one VGPR spill 1365 SB.MFI.addToSpilledVGPRs(1); 1366 } 1367 } 1368 1369 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, 1370 int Index, 1371 RegScavenger *RS, 1372 LiveIntervals *LIS, 1373 bool OnlyToVGPR) const { 1374 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1375 1376 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = 1377 SB.MFI.getSGPRToVGPRSpills(Index); 1378 bool SpillToVGPR = !VGPRSpills.empty(); 1379 if (OnlyToVGPR && !SpillToVGPR) 1380 return false; 1381 1382 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && 1383 SB.SuperReg != SB.MFI.getFrameOffsetReg())); 1384 1385 if (SpillToVGPR) { 1386 1387 assert(SB.NumSubRegs == VGPRSpills.size() && 1388 "Num of VGPR lanes should be equal to num of SGPRs spilled"); 1389 1390 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1391 Register SubReg = 1392 SB.NumSubRegs == 1 1393 ? SB.SuperReg 1394 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1395 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 1396 1397 bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1; 1398 1399 // Mark the "old value of vgpr" input undef only if this is the first sgpr 1400 // spill to this specific vgpr in the first basic block. 1401 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1402 SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) 1403 .addReg(SubReg, getKillRegState(UseKill)) 1404 .addImm(Spill.Lane) 1405 .addReg(Spill.VGPR); 1406 if (LIS) { 1407 if (i == 0) 1408 LIS->ReplaceMachineInstrInMaps(*MI, *MIB); 1409 else 1410 LIS->InsertMachineInstrInMaps(*MIB); 1411 } 1412 1413 if (i == 0 && SB.NumSubRegs > 1) { 1414 // We may be spilling a super-register which is only partially defined, 1415 // and need to ensure later spills think the value is defined. 1416 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1417 } 1418 1419 if (SB.NumSubRegs > 1) 1420 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit); 1421 1422 // FIXME: Since this spills to another register instead of an actual 1423 // frame index, we should delete the frame index when all references to 1424 // it are fixed. 1425 } 1426 } else { 1427 SB.prepare(); 1428 1429 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. 1430 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1431 1432 // Per VGPR helper data 1433 auto PVD = SB.getPerVGPRData(); 1434 1435 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1436 unsigned TmpVGPRFlags = RegState::Undef; 1437 1438 // Write sub registers into the VGPR 1439 for (unsigned i = Offset * PVD.PerVGPR, 1440 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1441 i < e; ++i) { 1442 Register SubReg = 1443 SB.NumSubRegs == 1 1444 ? 
SB.SuperReg 1445 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1446 1447 MachineInstrBuilder WriteLane = 1448 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1449 SB.TmpVGPR) 1450 .addReg(SubReg, SubKillState) 1451 .addImm(i % PVD.PerVGPR) 1452 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1453 TmpVGPRFlags = 0; 1454 1455 if (LIS) { 1456 if (i == 0) 1457 LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane); 1458 else 1459 LIS->InsertMachineInstrInMaps(*WriteLane); 1460 } 1461 1462 // There could be undef components of a spilled super register. 1463 // TODO: Can we detect this and skip the spill? 1464 if (SB.NumSubRegs > 1) { 1465 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1466 unsigned SuperKillState = 0; 1467 if (i + 1 == SB.NumSubRegs) 1468 SuperKillState |= getKillRegState(SB.IsKill); 1469 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1470 } 1471 } 1472 1473 // Write out VGPR 1474 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); 1475 } 1476 1477 SB.restore(); 1478 } 1479 1480 MI->eraseFromParent(); 1481 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1482 1483 if (LIS) 1484 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1485 1486 return true; 1487 } 1488 1489 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, 1490 int Index, 1491 RegScavenger *RS, 1492 LiveIntervals *LIS, 1493 bool OnlyToVGPR) const { 1494 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1495 1496 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = 1497 SB.MFI.getSGPRToVGPRSpills(Index); 1498 bool SpillToVGPR = !VGPRSpills.empty(); 1499 if (OnlyToVGPR && !SpillToVGPR) 1500 return false; 1501 1502 if (SpillToVGPR) { 1503 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1504 Register SubReg = 1505 SB.NumSubRegs == 1 1506 ? SB.SuperReg 1507 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1508 1509 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 1510 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1511 SubReg) 1512 .addReg(Spill.VGPR) 1513 .addImm(Spill.Lane); 1514 if (SB.NumSubRegs > 1 && i == 0) 1515 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1516 if (LIS) { 1517 if (i == e - 1) 1518 LIS->ReplaceMachineInstrInMaps(*MI, *MIB); 1519 else 1520 LIS->InsertMachineInstrInMaps(*MIB); 1521 } 1522 1523 } 1524 } else { 1525 SB.prepare(); 1526 1527 // Per VGPR helper data 1528 auto PVD = SB.getPerVGPRData(); 1529 1530 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1531 // Load in VGPR data 1532 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); 1533 1534 // Unpack lanes 1535 for (unsigned i = Offset * PVD.PerVGPR, 1536 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1537 i < e; ++i) { 1538 Register SubReg = 1539 SB.NumSubRegs == 1 1540 ? 
SB.SuperReg 1541 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1542 1543 bool LastSubReg = (i + 1 == e); 1544 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1545 SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) 1546 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1547 .addImm(i); 1548 if (SB.NumSubRegs > 1 && i == 0) 1549 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1550 if (LIS) { 1551 if (i == e - 1) 1552 LIS->ReplaceMachineInstrInMaps(*MI, *MIB); 1553 else 1554 LIS->InsertMachineInstrInMaps(*MIB); 1555 } 1556 } 1557 } 1558 1559 SB.restore(); 1560 } 1561 1562 MI->eraseFromParent(); 1563 1564 if (LIS) 1565 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1566 1567 return true; 1568 } 1569 1570 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, 1571 MachineBasicBlock &RestoreMBB, 1572 Register SGPR, RegScavenger *RS) const { 1573 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0, 1574 RS); 1575 SB.prepare(); 1576 // Generate the spill of SGPR to SB.TmpVGPR. 1577 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1578 auto PVD = SB.getPerVGPRData(); 1579 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1580 unsigned TmpVGPRFlags = RegState::Undef; 1581 // Write sub registers into the VGPR 1582 for (unsigned i = Offset * PVD.PerVGPR, 1583 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1584 i < e; ++i) { 1585 Register SubReg = 1586 SB.NumSubRegs == 1 1587 ? SB.SuperReg 1588 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1589 1590 MachineInstrBuilder WriteLane = 1591 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1592 SB.TmpVGPR) 1593 .addReg(SubReg, SubKillState) 1594 .addImm(i % PVD.PerVGPR) 1595 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1596 TmpVGPRFlags = 0; 1597 // There could be undef components of a spilled super register. 1598 // TODO: Can we detect this and skip the spill? 1599 if (SB.NumSubRegs > 1) { 1600 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1601 unsigned SuperKillState = 0; 1602 if (i + 1 == SB.NumSubRegs) 1603 SuperKillState |= getKillRegState(SB.IsKill); 1604 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1605 } 1606 } 1607 // Don't need to write VGPR out. 1608 } 1609 1610 // Restore clobbered registers in the specified restore block. 1611 MI = RestoreMBB.end(); 1612 SB.setMI(&RestoreMBB, MI); 1613 // Generate the restore of SGPR from SB.TmpVGPR. 1614 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1615 // Don't need to load VGPR in. 1616 // Unpack lanes 1617 for (unsigned i = Offset * PVD.PerVGPR, 1618 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1619 i < e; ++i) { 1620 Register SubReg = 1621 SB.NumSubRegs == 1 1622 ? SB.SuperReg 1623 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1624 bool LastSubReg = (i + 1 == e); 1625 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1626 SubReg) 1627 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1628 .addImm(i); 1629 if (SB.NumSubRegs > 1 && i == 0) 1630 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1631 } 1632 } 1633 SB.restore(); 1634 1635 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1636 return false; 1637 } 1638 1639 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 1640 /// a VGPR and the stack slot can be safely eliminated when all other users are 1641 /// handled. 
1642 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 1643 MachineBasicBlock::iterator MI, 1644 int FI, 1645 RegScavenger *RS, 1646 LiveIntervals *LIS) const { 1647 switch (MI->getOpcode()) { 1648 case AMDGPU::SI_SPILL_S1024_SAVE: 1649 case AMDGPU::SI_SPILL_S512_SAVE: 1650 case AMDGPU::SI_SPILL_S256_SAVE: 1651 case AMDGPU::SI_SPILL_S224_SAVE: 1652 case AMDGPU::SI_SPILL_S192_SAVE: 1653 case AMDGPU::SI_SPILL_S160_SAVE: 1654 case AMDGPU::SI_SPILL_S128_SAVE: 1655 case AMDGPU::SI_SPILL_S96_SAVE: 1656 case AMDGPU::SI_SPILL_S64_SAVE: 1657 case AMDGPU::SI_SPILL_S32_SAVE: 1658 return spillSGPR(MI, FI, RS, LIS, true); 1659 case AMDGPU::SI_SPILL_S1024_RESTORE: 1660 case AMDGPU::SI_SPILL_S512_RESTORE: 1661 case AMDGPU::SI_SPILL_S256_RESTORE: 1662 case AMDGPU::SI_SPILL_S224_RESTORE: 1663 case AMDGPU::SI_SPILL_S192_RESTORE: 1664 case AMDGPU::SI_SPILL_S160_RESTORE: 1665 case AMDGPU::SI_SPILL_S128_RESTORE: 1666 case AMDGPU::SI_SPILL_S96_RESTORE: 1667 case AMDGPU::SI_SPILL_S64_RESTORE: 1668 case AMDGPU::SI_SPILL_S32_RESTORE: 1669 return restoreSGPR(MI, FI, RS, LIS, true); 1670 default: 1671 llvm_unreachable("not an SGPR spill instruction"); 1672 } 1673 } 1674 1675 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 1676 int SPAdj, unsigned FIOperandNum, 1677 RegScavenger *RS) const { 1678 MachineFunction *MF = MI->getParent()->getParent(); 1679 MachineBasicBlock *MBB = MI->getParent(); 1680 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1681 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 1682 const SIInstrInfo *TII = ST.getInstrInfo(); 1683 DebugLoc DL = MI->getDebugLoc(); 1684 1685 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 1686 1687 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 1688 int Index = MI->getOperand(FIOperandNum).getIndex(); 1689 1690 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) 1691 ? 
getBaseRegister() 1692 : getFrameRegister(*MF); 1693 1694 switch (MI->getOpcode()) { 1695 // SGPR register spill 1696 case AMDGPU::SI_SPILL_S1024_SAVE: 1697 case AMDGPU::SI_SPILL_S512_SAVE: 1698 case AMDGPU::SI_SPILL_S256_SAVE: 1699 case AMDGPU::SI_SPILL_S224_SAVE: 1700 case AMDGPU::SI_SPILL_S192_SAVE: 1701 case AMDGPU::SI_SPILL_S160_SAVE: 1702 case AMDGPU::SI_SPILL_S128_SAVE: 1703 case AMDGPU::SI_SPILL_S96_SAVE: 1704 case AMDGPU::SI_SPILL_S64_SAVE: 1705 case AMDGPU::SI_SPILL_S32_SAVE: { 1706 spillSGPR(MI, Index, RS); 1707 break; 1708 } 1709 1710 // SGPR register restore 1711 case AMDGPU::SI_SPILL_S1024_RESTORE: 1712 case AMDGPU::SI_SPILL_S512_RESTORE: 1713 case AMDGPU::SI_SPILL_S256_RESTORE: 1714 case AMDGPU::SI_SPILL_S224_RESTORE: 1715 case AMDGPU::SI_SPILL_S192_RESTORE: 1716 case AMDGPU::SI_SPILL_S160_RESTORE: 1717 case AMDGPU::SI_SPILL_S128_RESTORE: 1718 case AMDGPU::SI_SPILL_S96_RESTORE: 1719 case AMDGPU::SI_SPILL_S64_RESTORE: 1720 case AMDGPU::SI_SPILL_S32_RESTORE: { 1721 restoreSGPR(MI, Index, RS); 1722 break; 1723 } 1724 1725 // VGPR register spill 1726 case AMDGPU::SI_SPILL_V1024_SAVE: 1727 case AMDGPU::SI_SPILL_V512_SAVE: 1728 case AMDGPU::SI_SPILL_V256_SAVE: 1729 case AMDGPU::SI_SPILL_V224_SAVE: 1730 case AMDGPU::SI_SPILL_V192_SAVE: 1731 case AMDGPU::SI_SPILL_V160_SAVE: 1732 case AMDGPU::SI_SPILL_V128_SAVE: 1733 case AMDGPU::SI_SPILL_V96_SAVE: 1734 case AMDGPU::SI_SPILL_V64_SAVE: 1735 case AMDGPU::SI_SPILL_V32_SAVE: 1736 case AMDGPU::SI_SPILL_A1024_SAVE: 1737 case AMDGPU::SI_SPILL_A512_SAVE: 1738 case AMDGPU::SI_SPILL_A256_SAVE: 1739 case AMDGPU::SI_SPILL_A224_SAVE: 1740 case AMDGPU::SI_SPILL_A192_SAVE: 1741 case AMDGPU::SI_SPILL_A160_SAVE: 1742 case AMDGPU::SI_SPILL_A128_SAVE: 1743 case AMDGPU::SI_SPILL_A96_SAVE: 1744 case AMDGPU::SI_SPILL_A64_SAVE: 1745 case AMDGPU::SI_SPILL_A32_SAVE: { 1746 const MachineOperand *VData = TII->getNamedOperand(*MI, 1747 AMDGPU::OpName::vdata); 1748 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1749 MFI->getStackPtrOffsetReg()); 1750 1751 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 1752 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1753 auto *MBB = MI->getParent(); 1754 buildSpillLoadStore( 1755 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 1756 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1757 *MI->memoperands_begin(), RS); 1758 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 1759 MI->eraseFromParent(); 1760 break; 1761 } 1762 case AMDGPU::SI_SPILL_V32_RESTORE: 1763 case AMDGPU::SI_SPILL_V64_RESTORE: 1764 case AMDGPU::SI_SPILL_V96_RESTORE: 1765 case AMDGPU::SI_SPILL_V128_RESTORE: 1766 case AMDGPU::SI_SPILL_V160_RESTORE: 1767 case AMDGPU::SI_SPILL_V192_RESTORE: 1768 case AMDGPU::SI_SPILL_V224_RESTORE: 1769 case AMDGPU::SI_SPILL_V256_RESTORE: 1770 case AMDGPU::SI_SPILL_V512_RESTORE: 1771 case AMDGPU::SI_SPILL_V1024_RESTORE: 1772 case AMDGPU::SI_SPILL_A32_RESTORE: 1773 case AMDGPU::SI_SPILL_A64_RESTORE: 1774 case AMDGPU::SI_SPILL_A96_RESTORE: 1775 case AMDGPU::SI_SPILL_A128_RESTORE: 1776 case AMDGPU::SI_SPILL_A160_RESTORE: 1777 case AMDGPU::SI_SPILL_A192_RESTORE: 1778 case AMDGPU::SI_SPILL_A224_RESTORE: 1779 case AMDGPU::SI_SPILL_A256_RESTORE: 1780 case AMDGPU::SI_SPILL_A512_RESTORE: 1781 case AMDGPU::SI_SPILL_A1024_RESTORE: { 1782 const MachineOperand *VData = TII->getNamedOperand(*MI, 1783 AMDGPU::OpName::vdata); 1784 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1785 MFI->getStackPtrOffsetReg()); 1786 1787 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1788 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1789 auto *MBB = MI->getParent(); 1790 buildSpillLoadStore( 1791 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 1792 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1793 *MI->memoperands_begin(), RS); 1794 MI->eraseFromParent(); 1795 break; 1796 } 1797 1798 default: { 1799 // Other access to frame index 1800 const DebugLoc &DL = MI->getDebugLoc(); 1801 1802 int64_t Offset = FrameInfo.getObjectOffset(Index); 1803 if (ST.enableFlatScratch()) { 1804 if (TII->isFLATScratch(*MI)) { 1805 assert((int16_t)FIOperandNum == 1806 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 1807 AMDGPU::OpName::saddr)); 1808 1809 // The offset is always swizzled, just replace it 1810 if (FrameReg) 1811 FIOp.ChangeToRegister(FrameReg, false); 1812 1813 if (!Offset) 1814 return; 1815 1816 MachineOperand *OffsetOp = 1817 TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 1818 int64_t NewOffset = Offset + OffsetOp->getImm(); 1819 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 1820 SIInstrFlags::FlatScratch)) { 1821 OffsetOp->setImm(NewOffset); 1822 if (FrameReg) 1823 return; 1824 Offset = 0; 1825 } 1826 1827 assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) && 1828 "Unexpected vaddr for flat scratch with a FI operand"); 1829 1830 // On GFX10 we have ST mode to use no registers for an address. 1831 // Otherwise we need to materialize 0 into an SGPR. 1832 if (!Offset && ST.hasFlatScratchSTMode()) { 1833 unsigned Opc = MI->getOpcode(); 1834 unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); 1835 MI->RemoveOperand( 1836 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); 1837 MI->setDesc(TII->get(NewOpc)); 1838 return; 1839 } 1840 } 1841 1842 if (!FrameReg) { 1843 FIOp.ChangeToImmediate(Offset); 1844 if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) 1845 return; 1846 } 1847 1848 // We need to use register here. Check if we can use an SGPR or need 1849 // a VGPR. 
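      // Probe operand legality with m0 as a stand-in SGPR: if m0 is accepted
      // here, any scavenged SGPR_32 can be used for the frame index operand;
      // otherwise it has to be materialized in a VGPR.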
1850 FIOp.ChangeToRegister(AMDGPU::M0, false); 1851 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp); 1852 1853 if (!Offset && FrameReg && UseSGPR) { 1854 FIOp.setReg(FrameReg); 1855 return; 1856 } 1857 1858 const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass 1859 : &AMDGPU::VGPR_32RegClass; 1860 1861 Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR); 1862 FIOp.setReg(TmpReg); 1863 FIOp.setIsKill(true); 1864 1865 if ((!FrameReg || !Offset) && TmpReg) { 1866 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1867 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); 1868 if (FrameReg) 1869 MIB.addReg(FrameReg); 1870 else 1871 MIB.addImm(Offset); 1872 1873 return; 1874 } 1875 1876 Register TmpSReg = 1877 UseSGPR ? TmpReg 1878 : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, 1879 !UseSGPR); 1880 1881 // TODO: for flat scratch another attempt can be made with a VGPR index 1882 // if no SGPRs can be scavenged. 1883 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) 1884 report_fatal_error("Cannot scavenge register in FI elimination!"); 1885 1886 if (!TmpSReg) { 1887 // Use frame register and restore it after. 1888 TmpSReg = FrameReg; 1889 FIOp.setReg(FrameReg); 1890 FIOp.setIsKill(false); 1891 } 1892 1893 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) 1894 .addReg(FrameReg) 1895 .addImm(Offset); 1896 1897 if (!UseSGPR) 1898 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 1899 .addReg(TmpSReg, RegState::Kill); 1900 1901 if (TmpSReg == FrameReg) { 1902 // Undo frame register modification. 1903 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), 1904 FrameReg) 1905 .addReg(FrameReg) 1906 .addImm(-Offset); 1907 } 1908 1909 return; 1910 } 1911 1912 bool IsMUBUF = TII->isMUBUF(*MI); 1913 1914 if (!IsMUBUF && !MFI->isEntryFunction()) { 1915 // Convert to a swizzled stack address by scaling by the wave size. 1916 // 1917 // In an entry function/kernel the offset is already swizzled. 1918 1919 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; 1920 Register ResultReg = 1921 IsCopy ? MI->getOperand(0).getReg() 1922 : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1923 1924 int64_t Offset = FrameInfo.getObjectOffset(Index); 1925 if (Offset == 0) { 1926 // XXX - This never happens because of emergency scavenging slot at 0? 1927 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) 1928 .addImm(ST.getWavefrontSizeLog2()) 1929 .addReg(FrameReg); 1930 } else { 1931 if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { 1932 // Reuse ResultReg in intermediate step. 1933 Register ScaledReg = ResultReg; 1934 1935 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 1936 ScaledReg) 1937 .addImm(ST.getWavefrontSizeLog2()) 1938 .addReg(FrameReg); 1939 1940 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 1941 1942 // TODO: Fold if use instruction is another add of a constant. 1943 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 1944 // FIXME: This can fail 1945 MIB.addImm(Offset); 1946 MIB.addReg(ScaledReg, RegState::Kill); 1947 if (!IsVOP2) 1948 MIB.addImm(0); // clamp bit 1949 } else { 1950 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && 1951 "Need to reuse carry out register"); 1952 1953 // Use scavenged unused carry out as offset register. 
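          // The carry-out def of V_ADD_CO_U32_e64 was scavenged and is not
          // otherwise used, so its low 32 bits can hold the constant offset.
          // The emitted sequence is roughly:
          //   s_mov_b32 s<n>, <offset>
          //   v_add_co_u32 v<dst>, s[<n>:<n+1>], s<n>, v<scaled>
          // (in wave32 the carry-out is a single SGPR rather than a pair).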
1954 Register ConstOffsetReg; 1955 if (!isWave32) 1956 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 1957 else 1958 ConstOffsetReg = MIB.getReg(1); 1959 1960 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 1961 .addImm(Offset); 1962 MIB.addReg(ConstOffsetReg, RegState::Kill); 1963 MIB.addReg(ScaledReg, RegState::Kill); 1964 MIB.addImm(0); // clamp bit 1965 } 1966 } else { 1967 // We have to produce a carry out, and there isn't a free SGPR pair 1968 // for it. We can keep the whole computation on the SALU to avoid 1969 // clobbering an additional register at the cost of an extra mov. 1970 1971 // We may have 1 free scratch SGPR even though a carry out is 1972 // unavailable. Only one additional mov is needed. 1973 Register TmpScaledReg = 1974 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); 1975 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; 1976 1977 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) 1978 .addReg(FrameReg) 1979 .addImm(ST.getWavefrontSizeLog2()); 1980 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 1981 .addReg(ScaledReg, RegState::Kill) 1982 .addImm(Offset); 1983 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) 1984 .addReg(ScaledReg, RegState::Kill); 1985 1986 // If there were truly no free SGPRs, we need to undo everything. 1987 if (!TmpScaledReg.isValid()) { 1988 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 1989 .addReg(ScaledReg, RegState::Kill) 1990 .addImm(-Offset); 1991 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) 1992 .addReg(FrameReg) 1993 .addImm(ST.getWavefrontSizeLog2()); 1994 } 1995 } 1996 } 1997 1998 // Don't introduce an extra copy if we're just materializing in a mov. 1999 if (IsCopy) 2000 MI->eraseFromParent(); 2001 else 2002 FIOp.ChangeToRegister(ResultReg, false, false, true); 2003 return; 2004 } 2005 2006 if (IsMUBUF) { 2007 // Disable offen so we don't need a 0 vgpr base. 2008 assert(static_cast<int>(FIOperandNum) == 2009 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2010 AMDGPU::OpName::vaddr)); 2011 2012 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); 2013 assert((SOffset.isImm() && SOffset.getImm() == 0)); 2014 2015 if (FrameReg != AMDGPU::NoRegister) 2016 SOffset.ChangeToRegister(FrameReg, false); 2017 2018 int64_t Offset = FrameInfo.getObjectOffset(Index); 2019 int64_t OldImm 2020 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 2021 int64_t NewOffset = OldImm + Offset; 2022 2023 if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && 2024 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { 2025 MI->eraseFromParent(); 2026 return; 2027 } 2028 } 2029 2030 // If the offset is simply too big, don't convert to a scratch wave offset 2031 // relative index. 
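    // Fall back to the raw frame offset: fold it as an immediate if the
    // instruction can encode it, otherwise materialize it into a scavenged
    // VGPR with v_mov_b32 below.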
2032 2033 FIOp.ChangeToImmediate(Offset); 2034 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 2035 Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 2036 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2037 .addImm(Offset); 2038 FIOp.ChangeToRegister(TmpReg, false, false, true); 2039 } 2040 } 2041 } 2042 } 2043 2044 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { 2045 return AMDGPUInstPrinter::getRegisterName(Reg); 2046 } 2047 2048 static const TargetRegisterClass * 2049 getAnyVGPRClassForBitWidth(unsigned BitWidth) { 2050 if (BitWidth <= 64) 2051 return &AMDGPU::VReg_64RegClass; 2052 if (BitWidth <= 96) 2053 return &AMDGPU::VReg_96RegClass; 2054 if (BitWidth <= 128) 2055 return &AMDGPU::VReg_128RegClass; 2056 if (BitWidth <= 160) 2057 return &AMDGPU::VReg_160RegClass; 2058 if (BitWidth <= 192) 2059 return &AMDGPU::VReg_192RegClass; 2060 if (BitWidth <= 224) 2061 return &AMDGPU::VReg_224RegClass; 2062 if (BitWidth <= 256) 2063 return &AMDGPU::VReg_256RegClass; 2064 if (BitWidth <= 512) 2065 return &AMDGPU::VReg_512RegClass; 2066 if (BitWidth <= 1024) 2067 return &AMDGPU::VReg_1024RegClass; 2068 2069 return nullptr; 2070 } 2071 2072 static const TargetRegisterClass * 2073 getAlignedVGPRClassForBitWidth(unsigned BitWidth) { 2074 if (BitWidth <= 64) 2075 return &AMDGPU::VReg_64_Align2RegClass; 2076 if (BitWidth <= 96) 2077 return &AMDGPU::VReg_96_Align2RegClass; 2078 if (BitWidth <= 128) 2079 return &AMDGPU::VReg_128_Align2RegClass; 2080 if (BitWidth <= 160) 2081 return &AMDGPU::VReg_160_Align2RegClass; 2082 if (BitWidth <= 192) 2083 return &AMDGPU::VReg_192_Align2RegClass; 2084 if (BitWidth <= 224) 2085 return &AMDGPU::VReg_224_Align2RegClass; 2086 if (BitWidth <= 256) 2087 return &AMDGPU::VReg_256_Align2RegClass; 2088 if (BitWidth <= 512) 2089 return &AMDGPU::VReg_512_Align2RegClass; 2090 if (BitWidth <= 1024) 2091 return &AMDGPU::VReg_1024_Align2RegClass; 2092 2093 return nullptr; 2094 } 2095 2096 const TargetRegisterClass * 2097 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { 2098 if (BitWidth == 1) 2099 return &AMDGPU::VReg_1RegClass; 2100 if (BitWidth <= 16) 2101 return &AMDGPU::VGPR_LO16RegClass; 2102 if (BitWidth <= 32) 2103 return &AMDGPU::VGPR_32RegClass; 2104 return ST.needsAlignedVGPRs() ? 
getAlignedVGPRClassForBitWidth(BitWidth) 2105 : getAnyVGPRClassForBitWidth(BitWidth); 2106 } 2107 2108 static const TargetRegisterClass * 2109 getAnyAGPRClassForBitWidth(unsigned BitWidth) { 2110 if (BitWidth <= 64) 2111 return &AMDGPU::AReg_64RegClass; 2112 if (BitWidth <= 96) 2113 return &AMDGPU::AReg_96RegClass; 2114 if (BitWidth <= 128) 2115 return &AMDGPU::AReg_128RegClass; 2116 if (BitWidth <= 160) 2117 return &AMDGPU::AReg_160RegClass; 2118 if (BitWidth <= 192) 2119 return &AMDGPU::AReg_192RegClass; 2120 if (BitWidth <= 224) 2121 return &AMDGPU::AReg_224RegClass; 2122 if (BitWidth <= 256) 2123 return &AMDGPU::AReg_256RegClass; 2124 if (BitWidth <= 512) 2125 return &AMDGPU::AReg_512RegClass; 2126 if (BitWidth <= 1024) 2127 return &AMDGPU::AReg_1024RegClass; 2128 2129 return nullptr; 2130 } 2131 2132 static const TargetRegisterClass * 2133 getAlignedAGPRClassForBitWidth(unsigned BitWidth) { 2134 if (BitWidth <= 64) 2135 return &AMDGPU::AReg_64_Align2RegClass; 2136 if (BitWidth <= 96) 2137 return &AMDGPU::AReg_96_Align2RegClass; 2138 if (BitWidth <= 128) 2139 return &AMDGPU::AReg_128_Align2RegClass; 2140 if (BitWidth <= 160) 2141 return &AMDGPU::AReg_160_Align2RegClass; 2142 if (BitWidth <= 192) 2143 return &AMDGPU::AReg_192_Align2RegClass; 2144 if (BitWidth <= 224) 2145 return &AMDGPU::AReg_224_Align2RegClass; 2146 if (BitWidth <= 256) 2147 return &AMDGPU::AReg_256_Align2RegClass; 2148 if (BitWidth <= 512) 2149 return &AMDGPU::AReg_512_Align2RegClass; 2150 if (BitWidth <= 1024) 2151 return &AMDGPU::AReg_1024_Align2RegClass; 2152 2153 return nullptr; 2154 } 2155 2156 const TargetRegisterClass * 2157 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { 2158 if (BitWidth <= 16) 2159 return &AMDGPU::AGPR_LO16RegClass; 2160 if (BitWidth <= 32) 2161 return &AMDGPU::AGPR_32RegClass; 2162 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth) 2163 : getAnyAGPRClassForBitWidth(BitWidth); 2164 } 2165 2166 const TargetRegisterClass * 2167 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { 2168 if (BitWidth <= 16) 2169 return &AMDGPU::SGPR_LO16RegClass; 2170 if (BitWidth <= 32) 2171 return &AMDGPU::SReg_32RegClass; 2172 if (BitWidth <= 64) 2173 return &AMDGPU::SReg_64RegClass; 2174 if (BitWidth <= 96) 2175 return &AMDGPU::SGPR_96RegClass; 2176 if (BitWidth <= 128) 2177 return &AMDGPU::SGPR_128RegClass; 2178 if (BitWidth <= 160) 2179 return &AMDGPU::SGPR_160RegClass; 2180 if (BitWidth <= 192) 2181 return &AMDGPU::SGPR_192RegClass; 2182 if (BitWidth <= 224) 2183 return &AMDGPU::SGPR_224RegClass; 2184 if (BitWidth <= 256) 2185 return &AMDGPU::SGPR_256RegClass; 2186 if (BitWidth <= 512) 2187 return &AMDGPU::SGPR_512RegClass; 2188 if (BitWidth <= 1024) 2189 return &AMDGPU::SGPR_1024RegClass; 2190 2191 return nullptr; 2192 } 2193 2194 // FIXME: This is very slow. It might be worth creating a map from physreg to 2195 // register class. 
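// One possible shape for such a cache (an untested sketch, not wired in):
// populate a map from physical register to its base class once, then answer
// queries in O(1):
//
//   DenseMap<MCRegister, const TargetRegisterClass *> PhysRegToBaseClass;
//   for (const TargetRegisterClass *RC : BaseClasses)
//     for (MCPhysReg R : *RC)
//       PhysRegToBaseClass.try_emplace(R, RC); // first (smallest) class wins
//   ...
//   return PhysRegToBaseClass.lookup(Reg);     // nullptr when unmapped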
const TargetRegisterClass *
SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_LO16RegClass,
    &AMDGPU::VGPR_HI16RegClass,
    &AMDGPU::SReg_LO16RegClass,
    &AMDGPU::AGPR_LO16RegClass,
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::AGPR_32RegClass,
    &AMDGPU::VReg_64_Align2RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::AReg_64_Align2RegClass,
    &AMDGPU::AReg_64RegClass,
    &AMDGPU::VReg_96_Align2RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::SReg_96RegClass,
    &AMDGPU::AReg_96_Align2RegClass,
    &AMDGPU::AReg_96RegClass,
    &AMDGPU::VReg_128_Align2RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::AReg_128_Align2RegClass,
    &AMDGPU::AReg_128RegClass,
    &AMDGPU::VReg_160_Align2RegClass,
    &AMDGPU::VReg_160RegClass,
    &AMDGPU::SReg_160RegClass,
    &AMDGPU::AReg_160_Align2RegClass,
    &AMDGPU::AReg_160RegClass,
    &AMDGPU::VReg_192_Align2RegClass,
    &AMDGPU::VReg_192RegClass,
    &AMDGPU::SReg_192RegClass,
    &AMDGPU::AReg_192_Align2RegClass,
    &AMDGPU::AReg_192RegClass,
    &AMDGPU::VReg_224_Align2RegClass,
    &AMDGPU::VReg_224RegClass,
    &AMDGPU::SReg_224RegClass,
    &AMDGPU::AReg_224_Align2RegClass,
    &AMDGPU::AReg_224RegClass,
    &AMDGPU::VReg_256_Align2RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::AReg_256_Align2RegClass,
    &AMDGPU::AReg_256RegClass,
    &AMDGPU::VReg_512_Align2RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::AReg_512_Align2RegClass,
    &AMDGPU::AReg_512RegClass,
    &AMDGPU::SReg_1024RegClass,
    &AMDGPU::VReg_1024_Align2RegClass,
    &AMDGPU::VReg_1024RegClass,
    &AMDGPU::AReg_1024_Align2RegClass,
    &AMDGPU::AReg_1024RegClass,
    &AMDGPU::SCC_CLASSRegClass,
    &AMDGPU::Pseudo_SReg_32RegClass,
    &AMDGPU::Pseudo_SReg_128RegClass,
  };

  for (const TargetRegisterClass *BaseClass : BaseClasses) {
    if (BaseClass->contains(Reg)) {
      return BaseClass;
    }
  }
  return nullptr;
}

bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
                               Register Reg) const {
  const TargetRegisterClass *RC;
  if (Reg.isVirtual())
    RC = MRI.getRegClass(Reg);
  else
    RC = getPhysRegClass(Reg);
  return isSGPRClass(RC);
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
  assert(VRC && "Invalid register class size");
  return VRC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
  assert(ARC && "Invalid register class size");
  return ARC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
  unsigned Size = getRegSizeInBits(*VRC);
  if (Size == 32)
    return &AMDGPU::SGPR_32RegClass;
  const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
  assert(SRC && "Invalid register class size");
  return SRC;
}

const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
    const TargetRegisterClass *RC, unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
  if (isSGPRClass(RC)) {
    if (Size == 32)
      RC = &AMDGPU::SGPR_32RegClass;
    else
      RC = getSGPRClassForBitWidth(Size);
  } else if (hasAGPRs(RC)) {
    RC = getAGPRClassForBitWidth(Size);
  } else {
    RC = getVGPRClassForBitWidth(Size);
  }
  assert(RC && "Invalid sub-register class size");
  return RC;
}

const TargetRegisterClass *
SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
                                         const TargetRegisterClass *SubRC,
                                         unsigned SubIdx) const {
  // Ensure this subregister index is aligned in the super register.
  const TargetRegisterClass *MatchRC =
      getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
  return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
}

bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
  if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
      OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
    return !ST.hasMFMAInlineLiteralBug();

  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
         OpType <= AMDGPU::OPERAND_SRC_LAST;
}

bool SIRegisterInfo::shouldRewriteCopySrc(
  const TargetRegisterClass *DefRC,
  unsigned DefSubReg,
  const TargetRegisterClass *SrcRC,
  unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want
  // to stop and rewrite on anything that looks like a subregister
  // extract. Operations mostly don't care about the super register class, so
  // we only want to stop on the most basic of copies between the same register
  // class.
  //
  // e.g. if we have something like
  // %0 = ...
  // %1 = ...
  // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
  // %3 = COPY %2, sub0
  //
  // We want to look through the COPY to find:
  // => %3 = COPY %0

  // Plain copy.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
  // TODO: 64-bit operands have extending behavior from 32-bit literal.
  return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
         OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
}

/// Returns the lowest-numbered register in \p RC that is not used at any
/// point in the function, or AMDGPU::NoRegister if every register is used.
/// If \p ReserveHighestVGPR is true, the highest unused register is returned
/// instead.
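/// Illustrative use only (the names below are not from this file): after
/// register allocation a pass could look for a spare VGPR with
///   MCRegister FreeVGPR =
///       TRI.findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
/// and must be prepared for an AMDGPU::NoRegister result when the class is
/// fully used.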
MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
                                              const TargetRegisterClass *RC,
                                              const MachineFunction &MF,
                                              bool ReserveHighestVGPR) const {
  if (ReserveHighestVGPR) {
    for (MCRegister Reg : reverse(*RC))
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  } else {
    for (MCRegister Reg : *RC)
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  }
  return MCRegister();
}

ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                   unsigned EltSize) const {
  const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
  assert(RegBitWidth >= 32 && RegBitWidth <= 1024);

  const unsigned RegDWORDs = RegBitWidth / 32;
  const unsigned EltDWORDs = EltSize / 4;
  assert(RegSplitParts.size() + 1 >= EltDWORDs);

  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
  const unsigned NumParts = RegDWORDs / EltDWORDs;

  return makeArrayRef(Parts.data(), NumParts);
}

const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  Register Reg) const {
  return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && isVGPRClass(RC);
}

bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);

  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && isAGPRClass(RC);
}

bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase size of registers beyond dword, we would need to allocate
  // adjacent registers and constrain regalloc more than needed.

  // Always allow dword coalescing.
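  // For anything wider, only coalesce when the result is no larger than the
  // bigger of the two inputs, so the join never creates a wider tuple than
  // was already live.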
2442 if (SrcSize <= 32 || DstSize <= 32) 2443 return true; 2444 2445 return NewSize <= DstSize || NewSize <= SrcSize; 2446 } 2447 2448 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 2449 MachineFunction &MF) const { 2450 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2451 2452 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), 2453 MF.getFunction()); 2454 switch (RC->getID()) { 2455 default: 2456 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); 2457 case AMDGPU::VGPR_32RegClassID: 2458 case AMDGPU::VGPR_LO16RegClassID: 2459 case AMDGPU::VGPR_HI16RegClassID: 2460 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); 2461 case AMDGPU::SGPR_32RegClassID: 2462 case AMDGPU::SGPR_LO16RegClassID: 2463 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); 2464 } 2465 } 2466 2467 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 2468 unsigned Idx) const { 2469 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || 2470 Idx == AMDGPU::RegisterPressureSets::AGPR_32) 2471 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 2472 const_cast<MachineFunction &>(MF)); 2473 2474 if (Idx == AMDGPU::RegisterPressureSets::SReg_32) 2475 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 2476 const_cast<MachineFunction &>(MF)); 2477 2478 llvm_unreachable("Unexpected register pressure set!"); 2479 } 2480 2481 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 2482 static const int Empty[] = { -1 }; 2483 2484 if (RegPressureIgnoredUnits[RegUnit]) 2485 return Empty; 2486 2487 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); 2488 } 2489 2490 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 2491 // Not a callee saved register. 2492 return AMDGPU::SGPR30_SGPR31; 2493 } 2494 2495 const TargetRegisterClass * 2496 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 2497 const RegisterBank &RB, 2498 const MachineRegisterInfo &MRI) const { 2499 switch (RB.getID()) { 2500 case AMDGPU::VGPRRegBankID: 2501 return getVGPRClassForBitWidth(std::max(32u, Size)); 2502 case AMDGPU::VCCRegBankID: 2503 assert(Size == 1); 2504 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 2505 : &AMDGPU::SReg_64_XEXECRegClass; 2506 case AMDGPU::SGPRRegBankID: 2507 return getSGPRClassForBitWidth(std::max(32u, Size)); 2508 case AMDGPU::AGPRRegBankID: 2509 return getAGPRClassForBitWidth(std::max(32u, Size)); 2510 default: 2511 llvm_unreachable("unknown register bank"); 2512 } 2513 } 2514 2515 const TargetRegisterClass * 2516 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 2517 const MachineRegisterInfo &MRI) const { 2518 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 2519 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) 2520 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); 2521 2522 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>()) 2523 return getAllocatableClass(RC); 2524 2525 return nullptr; 2526 } 2527 2528 MCRegister SIRegisterInfo::getVCC() const { 2529 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 2530 } 2531 2532 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { 2533 // VGPR tuples have an alignment requirement on gfx90a variants. 2534 return ST.needsAlignedVGPRs() ? 
&AMDGPU::VReg_64_Align2RegClass 2535 : &AMDGPU::VReg_64RegClass; 2536 } 2537 2538 const TargetRegisterClass * 2539 SIRegisterInfo::getRegClass(unsigned RCID) const { 2540 switch ((int)RCID) { 2541 case AMDGPU::SReg_1RegClassID: 2542 return getBoolRC(); 2543 case AMDGPU::SReg_1_XEXECRegClassID: 2544 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 2545 : &AMDGPU::SReg_64_XEXECRegClass; 2546 case -1: 2547 return nullptr; 2548 default: 2549 return AMDGPUGenRegisterInfo::getRegClass(RCID); 2550 } 2551 } 2552 2553 // Find reaching register definition 2554 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 2555 MachineInstr &Use, 2556 MachineRegisterInfo &MRI, 2557 LiveIntervals *LIS) const { 2558 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 2559 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 2560 SlotIndex DefIdx; 2561 2562 if (Reg.isVirtual()) { 2563 if (!LIS->hasInterval(Reg)) 2564 return nullptr; 2565 LiveInterval &LI = LIS->getInterval(Reg); 2566 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 2567 : MRI.getMaxLaneMaskForVReg(Reg); 2568 VNInfo *V = nullptr; 2569 if (LI.hasSubRanges()) { 2570 for (auto &S : LI.subranges()) { 2571 if ((S.LaneMask & SubLanes) == SubLanes) { 2572 V = S.getVNInfoAt(UseIdx); 2573 break; 2574 } 2575 } 2576 } else { 2577 V = LI.getVNInfoAt(UseIdx); 2578 } 2579 if (!V) 2580 return nullptr; 2581 DefIdx = V->def; 2582 } else { 2583 // Find last def. 2584 for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid(); 2585 ++Units) { 2586 LiveRange &LR = LIS->getRegUnit(*Units); 2587 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 2588 if (!DefIdx.isValid() || 2589 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 2590 LIS->getInstructionFromIndex(V->def))) 2591 DefIdx = V->def; 2592 } else { 2593 return nullptr; 2594 } 2595 } 2596 } 2597 2598 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 2599 2600 if (!Def || !MDT.dominates(Def, &Use)) 2601 return nullptr; 2602 2603 assert(Def->modifiesRegister(Reg, this)); 2604 2605 return Def; 2606 } 2607 2608 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 2609 assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32); 2610 2611 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 2612 AMDGPU::SReg_32RegClass, 2613 AMDGPU::AGPR_32RegClass } ) { 2614 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 2615 return Super; 2616 } 2617 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 2618 &AMDGPU::VGPR_32RegClass)) { 2619 return Super; 2620 } 2621 2622 return AMDGPU::NoRegister; 2623 } 2624 2625 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { 2626 if (!ST.needsAlignedVGPRs()) 2627 return true; 2628 2629 if (hasVGPRs(&RC)) 2630 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); 2631 if (hasAGPRs(&RC)) 2632 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); 2633 2634 return true; 2635 } 2636 2637 bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { 2638 switch (PhysReg) { 2639 case AMDGPU::SGPR_NULL: 2640 case AMDGPU::SRC_SHARED_BASE: 2641 case AMDGPU::SRC_PRIVATE_BASE: 2642 case AMDGPU::SRC_SHARED_LIMIT: 2643 case AMDGPU::SRC_PRIVATE_LIMIT: 2644 return true; 2645 default: 2646 return false; 2647 } 2648 } 2649 2650 ArrayRef<MCPhysReg> 2651 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { 2652 return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), 2653 ST.getMaxNumSGPRs(MF) / 4); 2654 } 2655 2656 ArrayRef<MCPhysReg> 
2657 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 2658 return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(), 2659 ST.getMaxNumSGPRs(MF) / 2); 2660 } 2661 2662 ArrayRef<MCPhysReg> 2663 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 2664 return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 2665 } 2666
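// Illustrative use only (not code from this file): a caller that needs a free
// SGPR could scan the function's SGPR budget from the top down, e.g.
//   for (MCPhysReg Reg : reverse(TRI.getAllSGPR32(MF)))
//     if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
//       return Reg;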