1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI implementation of the TargetRegisterInfo class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIRegisterInfo.h" 15 #include "AMDGPU.h" 16 #include "AMDGPURegisterBankInfo.h" 17 #include "GCNSubtarget.h" 18 #include "MCTargetDesc/AMDGPUInstPrinter.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "llvm/CodeGen/LiveIntervals.h" 22 #include "llvm/CodeGen/MachineDominators.h" 23 #include "llvm/CodeGen/RegisterScavenging.h" 24 25 using namespace llvm; 26 27 #define GET_REGINFO_TARGET_DESC 28 #include "AMDGPUGenRegisterInfo.inc" 29 30 static cl::opt<bool> EnableSpillSGPRToVGPR( 31 "amdgpu-spill-sgpr-to-vgpr", 32 cl::desc("Enable spilling VGPRs to SGPRs"), 33 cl::ReallyHidden, 34 cl::init(true)); 35 36 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts; 37 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable; 38 39 // Map numbers of DWORDs to indexes in SubRegFromChannelTable. 40 // Valid indexes are shifted 1, such that a 0 mapping means unsupported. 41 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8, 42 // meaning index 7 in SubRegFromChannelTable. 43 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = { 44 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9}; 45 46 namespace llvm { 47 48 // A temporary struct to spill SGPRs. 49 // This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits 50 // just v_writelane and v_readlane. 
51 // 52 // When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR 53 // is saved to scratch (or the other way around for loads). 54 // For this, a VGPR is required where the needed lanes can be clobbered. The 55 // RegScavenger can provide a VGPR where currently active lanes can be 56 // clobbered, but we still need to save inactive lanes. 57 // The high-level steps are: 58 // - Try to scavenge SGPR(s) to save exec 59 // - Try to scavenge VGPR 60 // - Save needed, all or inactive lanes of a TmpVGPR 61 // - Spill/Restore SGPRs using TmpVGPR 62 // - Restore TmpVGPR 63 // 64 // To save all lanes of TmpVGPR, exec needs to be saved and modified. If we 65 // cannot scavenge temporary SGPRs to save exec, we use the following code: 66 // buffer_store_dword TmpVGPR ; only if active lanes need to be saved 67 // s_not exec, exec 68 // buffer_store_dword TmpVGPR ; save inactive lanes 69 // s_not exec, exec 70 struct SGPRSpillBuilder { 71 struct PerVGPRData { 72 unsigned PerVGPR; 73 unsigned NumVGPRs; 74 int64_t VGPRLanes; 75 }; 76 77 // The SGPR to save 78 Register SuperReg; 79 MachineBasicBlock::iterator MI; 80 ArrayRef<int16_t> SplitParts; 81 unsigned NumSubRegs; 82 bool IsKill; 83 const DebugLoc &DL; 84 85 /* When spilling to stack */ 86 // The SGPRs are written into this VGPR, which is then written to scratch 87 // (or vice versa for loads). 88 Register TmpVGPR = AMDGPU::NoRegister; 89 // Temporary spill slot to save TmpVGPR to. 90 int TmpVGPRIndex = 0; 91 // If TmpVGPR is live before the spill or if it is scavenged. 92 bool TmpVGPRLive = false; 93 // Scavenged SGPR to save EXEC. 94 Register SavedExecReg = AMDGPU::NoRegister; 95 // Stack index to write the SGPRs to. 
96 int Index; 97 unsigned EltSize = 4; 98 99 RegScavenger *RS; 100 MachineBasicBlock *MBB; 101 MachineFunction &MF; 102 SIMachineFunctionInfo &MFI; 103 const SIInstrInfo &TII; 104 const SIRegisterInfo &TRI; 105 bool IsWave32; 106 Register ExecReg; 107 unsigned MovOpc; 108 unsigned NotOpc; 109 110 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, 111 bool IsWave32, MachineBasicBlock::iterator MI, int Index, 112 RegScavenger *RS) 113 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(), 114 MI->getOperand(0).isKill(), Index, RS) {} 115 116 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, 117 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, 118 bool IsKill, int Index, RegScavenger *RS) 119 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()), 120 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()), 121 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), 122 IsWave32(IsWave32) { 123 const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg); 124 SplitParts = TRI.getRegSplitParts(RC, EltSize); 125 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 126 127 if (IsWave32) { 128 ExecReg = AMDGPU::EXEC_LO; 129 MovOpc = AMDGPU::S_MOV_B32; 130 NotOpc = AMDGPU::S_NOT_B32; 131 } else { 132 ExecReg = AMDGPU::EXEC; 133 MovOpc = AMDGPU::S_MOV_B64; 134 NotOpc = AMDGPU::S_NOT_B64; 135 } 136 137 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 138 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && 139 SuperReg != AMDGPU::EXEC && "exec should never spill"); 140 } 141 142 PerVGPRData getPerVGPRData() { 143 PerVGPRData Data; 144 Data.PerVGPR = IsWave32 ? 32 : 64; 145 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR; 146 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL; 147 return Data; 148 } 149 150 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is 151 // free. 
152 // Writes these instructions if an SGPR can be scavenged: 153 // s_mov_b64 s[6:7], exec ; Save exec 154 // s_mov_b64 exec, 3 ; Wanted lanemask 155 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot 156 // 157 // Writes these instructions if no SGPR can be scavenged: 158 // buffer_store_dword v0 ; Only if no free VGPR was found 159 // s_not_b64 exec, exec 160 // buffer_store_dword v0 ; Save inactive lanes 161 // ; exec stays inverted, it is flipped back in 162 // ; restore. 163 void prepare() { 164 // Scavenged temporary VGPR to use. It must be scavenged once for any number 165 // of spilled subregs. 166 // FIXME: The liveness analysis is limited and does not tell if a register 167 // is in use in lanes that are currently inactive. We can never be sure if 168 // a register as actually in use in another lane, so we need to save all 169 // used lanes of the chosen VGPR. 170 assert(RS && "Cannot spill SGPR to memory without RegScavenger"); 171 TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false); 172 173 // Reserve temporary stack slot 174 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI); 175 if (TmpVGPR) { 176 // Found a register that is dead in the currently active lanes, we only 177 // need to spill inactive lanes. 178 TmpVGPRLive = false; 179 } else { 180 // Pick v0 because it doesn't make a difference. 181 TmpVGPR = AMDGPU::VGPR0; 182 TmpVGPRLive = true; 183 } 184 185 if (TmpVGPRLive) { 186 // We need to inform the scavenger that this index is already in use until 187 // we're done with the custom emergency spill. 188 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR); 189 } 190 191 // We may end up recursively calling the scavenger, and don't want to re-use 192 // the same register. 193 RS->setRegUsed(TmpVGPR); 194 195 // Try to scavenge SGPRs to save exec 196 assert(!SavedExecReg && "Exec is already saved, refuse to save again"); 197 const TargetRegisterClass &RC = 198 IsWave32 ? 
AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass; 199 RS->setRegUsed(SuperReg); 200 SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false); 201 202 int64_t VGPRLanes = getPerVGPRData().VGPRLanes; 203 204 if (SavedExecReg) { 205 RS->setRegUsed(SavedExecReg); 206 // Set exec to needed lanes 207 BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg); 208 auto I = 209 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes); 210 if (!TmpVGPRLive) 211 I.addReg(TmpVGPR, RegState::ImplicitDefine); 212 // Spill needed lanes 213 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); 214 } else { 215 // The modify and restore of exec clobber SCC, which we would have to save 216 // and restore. FIXME: We probably would need to reserve a register for 217 // this. 218 if (RS->isRegUsed(AMDGPU::SCC)) 219 MI->emitError("unhandled SGPR spill to memory"); 220 221 // Spill active lanes 222 if (TmpVGPRLive) 223 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false, 224 /*IsKill*/ false); 225 // Spill inactive lanes 226 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 227 if (!TmpVGPRLive) 228 I.addReg(TmpVGPR, RegState::ImplicitDefine); 229 I->getOperand(2).setIsDead(true); // Mark SCC as dead. 
230 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); 231 } 232 } 233 234 // Writes these instructions if an SGPR can be scavenged: 235 // buffer_load_dword v1 ; Write scavenged VGPR to emergency slot 236 // s_waitcnt vmcnt(0) ; If a free VGPR was found 237 // s_mov_b64 exec, s[6:7] ; Save exec 238 // 239 // Writes these instructions if no SGPR can be scavenged: 240 // buffer_load_dword v0 ; Restore inactive lanes 241 // s_waitcnt vmcnt(0) ; If a free VGPR was found 242 // s_not_b64 exec, exec 243 // buffer_load_dword v0 ; Only if no free VGPR was found 244 void restore() { 245 if (SavedExecReg) { 246 // Restore used lanes 247 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, 248 /*IsKill*/ false); 249 // Restore exec 250 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg) 251 .addReg(SavedExecReg, RegState::Kill); 252 // Add an implicit use of the load so it is not dead. 253 // FIXME This inserts an unnecessary waitcnt 254 if (!TmpVGPRLive) { 255 I.addReg(TmpVGPR, RegState::ImplicitKill); 256 } 257 } else { 258 // Restore inactive lanes 259 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, 260 /*IsKill*/ false); 261 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 262 if (!TmpVGPRLive) 263 I.addReg(TmpVGPR, RegState::ImplicitKill); 264 I->getOperand(2).setIsDead(true); // Mark SCC as dead. 265 266 // Restore active lanes 267 if (TmpVGPRLive) 268 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); 269 } 270 271 // Inform the scavenger where we're releasing our custom scavenged register. 272 if (TmpVGPRLive) { 273 MachineBasicBlock::iterator RestorePt = std::prev(MI); 274 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt); 275 } 276 } 277 278 // Write TmpVGPR to memory or read TmpVGPR from memory. 
279 // Either using a single buffer_load/store if exec is set to the needed mask 280 // or using 281 // buffer_load 282 // s_not exec, exec 283 // buffer_load 284 // s_not exec, exec 285 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) { 286 if (SavedExecReg) { 287 // Spill needed lanes 288 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 289 } else { 290 // The modify and restore of exec clobber SCC, which we would have to save 291 // and restore. FIXME: We probably would need to reserve a register for 292 // this. 293 if (RS->isRegUsed(AMDGPU::SCC)) 294 MI->emitError("unhandled SGPR spill to memory"); 295 296 // Spill active lanes 297 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, 298 /*IsKill*/ false); 299 // Spill inactive lanes 300 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 301 Not0->getOperand(2).setIsDead(); // Mark SCC as dead. 302 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 303 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 304 Not1->getOperand(2).setIsDead(); // Mark SCC as dead. 
305 } 306 } 307 308 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) { 309 assert(MBB->getParent() == &MF); 310 MI = NewMI; 311 MBB = NewMBB; 312 } 313 }; 314 315 } // namespace llvm 316 317 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) 318 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST), 319 SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { 320 321 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && 322 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) && 323 (getSubRegIndexLaneMask(AMDGPU::lo16) | 324 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() == 325 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() && 326 "getNumCoveredRegs() will not work with generated subreg masks!"); 327 328 RegPressureIgnoredUnits.resize(getNumRegUnits()); 329 RegPressureIgnoredUnits.set( 330 *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this)); 331 for (auto Reg : AMDGPU::VGPR_HI16RegClass) 332 RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this)); 333 334 // HACK: Until this is fully tablegen'd. 335 static llvm::once_flag InitializeRegSplitPartsFlag; 336 337 static auto InitializeRegSplitPartsOnce = [this]() { 338 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) { 339 unsigned Size = getSubRegIdxSize(Idx); 340 if (Size & 31) 341 continue; 342 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1]; 343 unsigned Pos = getSubRegIdxOffset(Idx); 344 if (Pos % Size) 345 continue; 346 Pos /= Size; 347 if (Vec.empty()) { 348 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits. 
349 Vec.resize(MaxNumParts); 350 } 351 Vec[Pos] = Idx; 352 } 353 }; 354 355 static llvm::once_flag InitializeSubRegFromChannelTableFlag; 356 357 static auto InitializeSubRegFromChannelTableOnce = [this]() { 358 for (auto &Row : SubRegFromChannelTable) 359 Row.fill(AMDGPU::NoSubRegister); 360 for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { 361 unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32; 362 unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32; 363 assert(Width < SubRegFromChannelTableWidthMap.size()); 364 Width = SubRegFromChannelTableWidthMap[Width]; 365 if (Width == 0) 366 continue; 367 unsigned TableIdx = Width - 1; 368 assert(TableIdx < SubRegFromChannelTable.size()); 369 assert(Offset < SubRegFromChannelTable[TableIdx].size()); 370 SubRegFromChannelTable[TableIdx][Offset] = Idx; 371 } 372 }; 373 374 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce); 375 llvm::call_once(InitializeSubRegFromChannelTableFlag, 376 InitializeSubRegFromChannelTableOnce); 377 } 378 379 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, 380 MCRegister Reg) const { 381 MCRegAliasIterator R(Reg, this, true); 382 383 for (; R.isValid(); ++R) 384 Reserved.set(*R); 385 } 386 387 // Forced to be here by one .inc 388 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( 389 const MachineFunction *MF) const { 390 CallingConv::ID CC = MF->getFunction().getCallingConv(); 391 switch (CC) { 392 case CallingConv::C: 393 case CallingConv::Fast: 394 case CallingConv::Cold: 395 return ST.hasGFX90AInsts() ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList 396 : CSR_AMDGPU_HighRegs_SaveList; 397 case CallingConv::AMDGPU_Gfx: 398 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList 399 : CSR_AMDGPU_SI_Gfx_SaveList; 400 default: { 401 // Dummy to not crash RegisterClassInfo. 
402 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; 403 return &NoCalleeSavedReg; 404 } 405 } 406 } 407 408 const MCPhysReg * 409 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { 410 return nullptr; 411 } 412 413 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, 414 CallingConv::ID CC) const { 415 switch (CC) { 416 case CallingConv::C: 417 case CallingConv::Fast: 418 case CallingConv::Cold: 419 return ST.hasGFX90AInsts() ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask 420 : CSR_AMDGPU_HighRegs_RegMask; 421 case CallingConv::AMDGPU_Gfx: 422 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask 423 : CSR_AMDGPU_SI_Gfx_RegMask; 424 default: 425 return nullptr; 426 } 427 } 428 429 const uint32_t *SIRegisterInfo::getNoPreservedMask() const { 430 return CSR_AMDGPU_NoRegs_RegMask; 431 } 432 433 const TargetRegisterClass * 434 SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, 435 const MachineFunction &MF) const { 436 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the 437 // equivalent AV class. If used one, the verifier will crash after 438 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given 439 // until Instruction selection. 
440 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) { 441 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) 442 return &AMDGPU::AV_32RegClass; 443 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) 444 return &AMDGPU::AV_64RegClass; 445 if (RC == &AMDGPU::VReg_64_Align2RegClass || 446 RC == &AMDGPU::AReg_64_Align2RegClass) 447 return &AMDGPU::AV_64_Align2RegClass; 448 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass) 449 return &AMDGPU::AV_96RegClass; 450 if (RC == &AMDGPU::VReg_96_Align2RegClass || 451 RC == &AMDGPU::AReg_96_Align2RegClass) 452 return &AMDGPU::AV_96_Align2RegClass; 453 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass) 454 return &AMDGPU::AV_128RegClass; 455 if (RC == &AMDGPU::VReg_128_Align2RegClass || 456 RC == &AMDGPU::AReg_128_Align2RegClass) 457 return &AMDGPU::AV_128_Align2RegClass; 458 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass) 459 return &AMDGPU::AV_160RegClass; 460 if (RC == &AMDGPU::VReg_160_Align2RegClass || 461 RC == &AMDGPU::AReg_160_Align2RegClass) 462 return &AMDGPU::AV_160_Align2RegClass; 463 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass) 464 return &AMDGPU::AV_192RegClass; 465 if (RC == &AMDGPU::VReg_192_Align2RegClass || 466 RC == &AMDGPU::AReg_192_Align2RegClass) 467 return &AMDGPU::AV_192_Align2RegClass; 468 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass) 469 return &AMDGPU::AV_256RegClass; 470 if (RC == &AMDGPU::VReg_256_Align2RegClass || 471 RC == &AMDGPU::AReg_256_Align2RegClass) 472 return &AMDGPU::AV_256_Align2RegClass; 473 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass) 474 return &AMDGPU::AV_512RegClass; 475 if (RC == &AMDGPU::VReg_512_Align2RegClass || 476 RC == &AMDGPU::AReg_512_Align2RegClass) 477 return &AMDGPU::AV_512_Align2RegClass; 478 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass) 479 return 
&AMDGPU::AV_1024RegClass; 480 if (RC == &AMDGPU::VReg_1024_Align2RegClass || 481 RC == &AMDGPU::AReg_1024_Align2RegClass) 482 return &AMDGPU::AV_1024_Align2RegClass; 483 } 484 485 return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF); 486 } 487 488 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { 489 const SIFrameLowering *TFI = ST.getFrameLowering(); 490 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 491 // During ISel lowering we always reserve the stack pointer in entry 492 // functions, but never actually want to reference it when accessing our own 493 // frame. If we need a frame pointer we use it, but otherwise we can just use 494 // an immediate "0" which we represent by returning NoRegister. 495 if (FuncInfo->isEntryFunction()) { 496 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); 497 } 498 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() 499 : FuncInfo->getStackPtrOffsetReg(); 500 } 501 502 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { 503 // When we need stack realignment, we can't reference off of the 504 // stack pointer, so we reserve a base pointer. 
const MachineFrameInfo &MFI = MF.getFrameInfo();
  // A base pointer is only needed when there are fixed objects and the stack
  // has to be realigned.
  return MFI.getNumFixedObjects() && shouldRealignStack(MF);
}

// The base pointer, when one is needed, is always SGPR34.
Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

// Register mask covering all VGPRs.
const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
  return CSR_AMDGPU_AllVGPRs_RegMask;
}

// Register mask covering all AGPRs.
const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
  return CSR_AMDGPU_AllAGPRs_RegMask;
}

// Register mask covering all vector registers.
const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
  return CSR_AMDGPU_AllVectorRegs_RegMask;
}

// Register mask covering all allocatable SGPRs.
const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
  return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}

// Looks up the sub-register index that covers NumRegs DWORDs starting at
// DWORD Channel. NumRegs must have a non-zero entry in
// SubRegFromChannelTableWidthMap (asserted); the result comes straight from
// SubRegFromChannelTable, which the SIRegisterInfo constructor fills in.
unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
                                              unsigned NumRegs) {
  assert(NumRegs < SubRegFromChannelTableWidthMap.size());
  unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
  assert(NumRegIndex && "Not implemented");
  assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
  return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}

// Returns the SGPR_128 tuple whose sub0 is the SGPR at index
// alignDown(MaxNumSGPRs, 4) - 4, i.e. the last 4-aligned group of four SGPRs
// below the function's SGPR limit.
MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
  const MachineFunction &MF) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}

// Builds the set of registers the allocator must not use in MF: special and
// unsupported hardware registers, SGPRs/VGPRs/AGPRs beyond the function's
// budget, and the registers assigned to the scratch descriptor, stack/frame/
// base pointers, WWM and the various spill schemes.
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::MODE);

  // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);

  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
  // will result in bugs.
  if (isWave32) {
    Reserved.set(AMDGPU::VCC);
    Reserved.set(AMDGPU::VCC_HI);
  }

  // Reserve every SGPR above the per-function limit.
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxNumAGPRs = MaxNumVGPRs;
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();

  if (ST.hasGFX90AInsts()) {
    // In an entry function without calls and AGPRs used it is possible to use
    // the whole register budget for VGPRs.

    // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and
    //       split register file accordingly.
    if (MFI->usesAGPRs(MF)) {
      // Split the budget evenly between VGPRs and AGPRs.
      MaxNumVGPRs /= 2;
      MaxNumAGPRs = MaxNumVGPRs;
    } else {
      // No AGPR use: VGPRs get the budget first, and only the part of it that
      // exceeds the VGPR file is left for AGPRs.
      if (MaxNumVGPRs > TotalNumVGPRs) {
        MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
        MaxNumVGPRs = TotalNumVGPRs;
      } else
        MaxNumAGPRs = 0;
    }
  } else if (ST.hasMAIInsts() && MFI->usesAGPRs(MF)) {
    // In order to guarantee copying between AGPRs, we need a scratch VGPR
    // available at all times.
    reserveRegisterTuples(Reserved, AMDGPU::VGPR32);
  }

  // Reserve every VGPR above the VGPR budget.
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  // Reserve every AGPR above the AGPR budget. TotalNumVGPRs is used as the
  // upper bound for the AGPR class as well.
  for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  // Reserve the hi16 half of every SReg_32 register, and the lo16 half unless
  // it belongs to SGPR_LO16.
  for (auto Reg : AMDGPU::SReg_32RegClass) {
    Reserved.set(getSubReg(Reg, AMDGPU::hi16));
    Register Low = getSubReg(Reg, AMDGPU::lo16);
    // This is to prevent BB vcc liveness errors.
    if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
      Reserved.set(Low);
  }

  // Reserve the hi16 half of every AGPR.
  for (auto Reg : AMDGPU::AGPR_32RegClass) {
    Reserved.set(getSubReg(Reg, AMDGPU::hi16));
  }

  // Reserve all remaining AGPRs if there are no instructions that can use
  // them.
  if (!ST.hasMAIInsts()) {
    for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
      unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  }

  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
    // to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
  }

  // We have to assume the SP is needed in case there are calls in the function,
  // which is detected after the function is lowered. If we aren't really going
  // to need SP, don't bother reserving it.
  MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();

  if (StackPtrReg) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  MCRegister FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  if (hasBasePointer(MF)) {
    MCRegister BasePtrReg = getBaseRegister();
    reserveRegisterTuples(Reserved, BasePtrReg);
    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
  }

  for (auto Reg : MFI->WWMReservedRegs) {
    reserveRegisterTuples(Reserved, Reg.first);
  }

  // Reserve VGPRs used for SGPR spilling.
  // Note we treat freezeReservedRegs unusually because we run register
  // allocation in two phases. It's OK to re-freeze with new registers for the
  // second run.
  // NOTE: the loop below is compiled out by the "#if 0".
#if 0
  for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) {
    for (auto &SpilledVGPR : SpilledFI.second)
      reserveRegisterTuples(Reserved, SpilledVGPR.VGPR);
  }
#endif

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (auto SSpill : MFI->getSGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, SSpill.VGPR);

  return Reserved;
}

// Inline asm may clobber exactly those registers that are not reserved in MF.
bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
                                      MCRegister PhysReg) const {
  return !MF.getRegInfo().isReserved(PhysReg);
}

// Entry functions never realign the stack; everything else defers to the
// generic TargetRegisterInfo implementation.
bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry, the base address is 0, so it can't possibly need any more
  // alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isEntryFunction())
    return false;

  return TargetRegisterInfo::shouldRealignStack(MF);
}

// Entry functions need the scavenger only when they have stack objects or
// calls; all other functions always request it.
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
  const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
  // spill.
return false;
}

/// Returns true when the function's frame info reports any stack objects.
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

/// Returns true unconditionally, for every function.
bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}

/// Returns the immediate stored in the named `offset` operand of \p MI.
/// \p MI must be a MUBUF or FLAT scratch instruction (asserted).
int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

/// Returns the immediate offset already encoded in \p MI, or 0 when \p MI is
/// neither a MUBUF nor a FLAT scratch instruction.
/// \p Idx is asserted to be the index of the `vaddr` or `saddr` operand.
int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return 0;

  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
         (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");

  return getScratchInstrOffset(MI);
}

/// Returns true when \p Offset plus the offset already encoded in \p MI is
/// not a legal immediate offset for the instruction's encoding (MUBUF or
/// FLAT scratch). Returns false for any other kind of instruction.
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t FullOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                 SIInstrFlags::FlatScratch);
}

/// Inserts, at the beginning of \p MBB, instructions that compute
/// "frame index \p FrameIdx + \p Offset" into a newly created virtual
/// register, and returns that register.
///
/// With flat scratch enabled the value is built with scalar instructions
/// (S_MOV_B32 / S_ADD_I32) in an SGPR class; otherwise V_MOV_B32 and the
/// add returned by getAddNoCarry() are used and the result is a VGPR_32.
/// When \p Offset is 0 a single move of the frame index is emitted.
Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                      int FrameIdx,
                                                      int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  // Borrow the debug location of the first instruction, if the block has one.
  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
                                           : AMDGPU::V_MOV_B32_e32;

  Register BaseReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                             : &AMDGPU::VGPR_32RegClass);

  // No offset to add: the base register is just the frame index itself.
  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
      .addFrameIndex(FrameIdx);
    return BaseReg;
  }

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
                             : &AMDGPU::VGPR_32RegClass);

  // OffsetReg = Offset; FIReg = frame index; BaseReg = OffsetReg + FIReg.
  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
    .addFrameIndex(FrameIdx);

  if (ST.enableFlatScratch() ) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
        .addReg(OffsetReg, RegState::Kill)
        .addReg(FIReg);
    return BaseReg;
  }

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(FIReg)
    .addImm(0); // clamp bit

  return BaseReg;
}

/// Rewrites \p MI so that its frame-index address operand (`saddr` for FLAT
/// scratch, `vaddr` otherwise) becomes \p BaseReg, and adds \p Offset to the
/// instruction's immediate `offset` operand.
///
/// The combined offset is asserted to be legal for the encoding, and for the
/// MUBUF form `soffset` is asserted to be the immediate 0.
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  // NOTE(review): OffsetOp is dereferenced without a null check, and before
  // the asserts below run.
  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                  SIInstrFlags::FlatScratch) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif

  assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
         "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}

/// Returns true when \p Offset plus the offset already encoded in \p MI is a
/// legal immediate offset for the instruction's encoding. Returns false for
/// instructions that are neither MUBUF nor FLAT scratch. \p BaseReg is not
/// consulted.
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        Register BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t NewOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                SIInstrFlags::FlatScratch);
}

/// Always returns VGPR_32; \p MF and \p Kind are ignored.
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

/// Returns the equivalent VGPR class for an AGPR class when the subtarget
/// lacks GFX90A instructions; otherwise returns \p RC unchanged.
const TargetRegisterClass *
SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
  if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
    return getEquivalentVGPRClass(RC);

  return RC;
}

/// Maps an SI_SPILL_* save/restore pseudo opcode to the number of 32-bit
/// pieces implied by the width in its name (1024 -> 32, 512 -> 16, ...,
/// 32 -> 1). Any other opcode is unreachable.
static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
  case AMDGPU::SI_SPILL_AV1024_SAVE:
  case AMDGPU::SI_SPILL_AV1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_AV512_SAVE:
  case AMDGPU::SI_SPILL_AV512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_AV256_SAVE:
  case AMDGPU::SI_SPILL_AV256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
  case AMDGPU::SI_SPILL_AV224_SAVE:
  case AMDGPU::SI_SPILL_AV224_RESTORE:
    return 7;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_AV192_SAVE:
  case AMDGPU::SI_SPILL_AV192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_AV160_SAVE:
  case AMDGPU::SI_SPILL_AV160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_AV128_SAVE:
  case AMDGPU::SI_SPILL_AV128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_AV96_SAVE:
  case AMDGPU::SI_SPILL_AV96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_AV64_SAVE:
  case AMDGPU::SI_SPILL_AV64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_AV32_SAVE:
  case AMDGPU::SI_SPILL_AV32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

/// Maps a BUFFER_STORE_*_OFFEN opcode to its *_OFFSET counterpart.
/// Returns -1 for any opcode not listed.
static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}

/// Maps a BUFFER_LOAD_*_OFFEN opcode to its *_OFFSET counterpart.
/// Returns -1 for any opcode not listed.
static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

/// Inverse of getOffsetMUBUFStore(): maps a BUFFER_STORE_*_OFFSET opcode to
/// its *_OFFEN counterpart. Returns -1 for any opcode not listed.
static int getOffenMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
  default:
    return -1;
  }
}

/// Inverse of getOffsetMUBUFLoad(): maps a BUFFER_LOAD_*_OFFSET opcode to
/// its *_OFFEN counterpart. Returns -1 for any opcode not listed.
static int getOffenMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
  default:
    return -1;
  }
}

/// Tries to satisfy one 32-bit lane of a spill or reload with a register
/// move instead of a memory access.
///
/// Looks up the register recorded for (\p Index, \p Lane) via
/// getVGPRToAGPRSpill(). If none is recorded, returns a default-constructed
/// builder (getInstr() is null). Otherwise emits, before \p MI, a COPY or a
/// V_ACCVGPR_WRITE/READ between that register and \p ValueReg and returns
/// its builder. The direction follows MI->mayStore(): a store moves
/// \p ValueReg into the recorded register, a load moves it back.
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  bool IsVGPR = TRI->isVGPR(MRI, Reg);
  DebugLoc DL = MI->getDebugLoc();
  if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
    // Spiller during regalloc may restore a spilled register to its superclass.
    // It could result in AGPR spills restored to VGPRs or the other way around,
    // making the src and dst with identical regclasses at this point. It just
    // needs a copy in such cases.
    auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
                       .addReg(Src, getKillRegState(IsKill));
    CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    return CopyMIB;
  }
  // Register kinds differ. (IsStore ^ IsVGPR) is true exactly when the
  // destination is the non-VGPR side, which selects the accvgpr write.
  unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
                 .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
//
// Emits, before MI, the *_OFFSET form of MI's *_OFFEN buffer load/store with
// Offset as the immediate, reusing MI's vdata, srsrc and soffset operands and
// memory references. Returns false (emitting nothing) when MI's opcode has no
// *_OFFSET counterpart. Returns true when the access was instead handled by
// spillVGPRtoAGPR() or the new instruction was built. MI itself is not
// erased here, and the MFI parameter is not used in this body.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // cpol
          .addImm(0) // tfe
          .addImm(0) // swz
          .cloneMemRefs(*MI);

  // Carry over the vdata_in operand when the original instruction has one.
  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

/// Picks the SCRATCH_LOAD/STORE opcode that transfers \p EltSize bytes
/// (4, 8, 12 or 16; anything else is unreachable).
///
/// Load versus store, and the addressing variant, are inherited from
/// \p LoadStoreOp: if it has a `vaddr` operand the SV form is returned, if it
/// has neither `vaddr` nor `saddr` the ST form, otherwise the SADDR form.
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
                                          unsigned LoadStoreOp,
                                          unsigned EltSize) {
  bool IsStore = TII->get(LoadStoreOp).mayStore();
  bool HasVAddr = AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) != -1;
  bool UseST =
    !HasVAddr &&
    AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0;

  switch (EltSize) {
  case 4:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    break;
  case 8:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
    break;
  case 12:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
    break;
  case 16:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
    break;
  default:
    llvm_unreachable("Unexpected spill load/store size!");
  }

  if (HasVAddr)
    LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
  else if (UseST)
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);

  return LoadStoreOp;
}

/// Emits, before \p MI, the loads or stores that move \p ValueReg to or from
/// the stack object \p Index, starting from opcode \p LoadStoreOp.
///
/// The register is handled in pieces of EltSize bytes: 4 bytes, or up to 16
/// bytes for flat-scratch opcodes when the value is not treated as an AGPR.
/// A trailing remainder smaller than EltSize becomes one extra piece. For
/// each piece the lanes are first offered to spillVGPRtoAGPR(), highest lane
/// first; only the lanes that were not placed in a register are transferred
/// through memory.
///
/// If the largest offset needed is not a legal immediate (or a flat-scratch
/// access has no SGPR offset and the subtarget has no ST mode), the offset is
/// materialized into an SGPR found through \p RS or \p LiveRegs, or into a
/// VGPR when no SGPR can be used.
///
/// At most one of \p RS and \p LiveRegs may be non-null (asserted).
/// \p MMO supplies the pointer info and flags for the per-piece memory
/// operands and is dereferenced unconditionally.
void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
    RegScavenger *RS, LivePhysRegs *LiveRegs) const {
  assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both");

  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  bool CanClobberSCC = false;
  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
  // Register width in bytes.
  const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  // Remember the unmodified offset; Offset itself may be reset to 0 below.
  int64_t MaterializedOffset = Offset;

  // Offset at which the last full-size piece starts.
  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
  int64_t ScratchOffsetRegDelta = 0;

  if (IsFlat && EltSize > 4) {
    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    Desc = &TII->get(LoadStoreOp);
  }

  Align Alignment = MFI.getObjectAlign(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  assert((IsFlat || ((Offset % EltSize) == 0)) &&
         "unexpected VGPR spill offset");

  // Track a VGPR to use for a constant offset we need to materialize.
  Register TmpOffsetVGPR;

  // Track a VGPR to use as an intermediate value.
  Register TmpIntermediateVGPR;
  bool UseVGPROffset = false;

  // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
  // combination.
  auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
                                int64_t VOffset) {
    // We are using a VGPR offset
    if (IsFlat && SGPRBase) {
      // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
      // SGPR, so perform the add as vector.
      // We don't need a base SGPR in the kernel.

      if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
          .addReg(SGPRBase)
          .addImm(VOffset)
          .addImm(0); // clamp
      } else {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SGPRBase);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
          .addImm(VOffset)
          .addReg(TmpOffsetVGPR);
      }
    } else {
      assert(TmpOffsetVGPR);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addImm(VOffset);
    }
  };

  bool IsOffsetLegal =
      IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                      SIInstrFlags::FlatScratch)
             : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
  if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
    SOffset = MCRegister();

    // We don't have access to the register scavenger if this function is called
    // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case.
    // TODO: Clobbering SCC is not necessary for scratch instructions in the
    // entry.
    if (RS) {
      SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);

      // Piggy back on the liveness scan we just did to see if SCC is dead.
      CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
    } else if (LiveRegs) {
      CanClobberSCC = !LiveRegs->contains(AMDGPU::SCC);
      for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
        if (LiveRegs->available(MF->getRegInfo(), Reg)) {
          SOffset = Reg;
          break;
        }
      }
    }

    // Adding to an existing base register uses S_ADD_I32 below, so give up on
    // the SGPR when SCC must be preserved.
    if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
      SOffset = Register();

    if (!SOffset) {
      UseVGPROffset = true;

      if (RS) {
        TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
      } else {
        assert(LiveRegs);
        for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
          if (LiveRegs->available(MF->getRegInfo(), Reg)) {
            TmpOffsetVGPR = Reg;
            break;
          }
        }
      }

      assert(TmpOffsetVGPR);
    } else if (!SOffset && CanClobberSCC) {
      // NOTE(review): this condition repeats the `!SOffset` test of the branch
      // above, so as written this arm cannot be taken.

      // There are no free SGPRs, and since we are in the process of spilling
      // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
      // on SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.

      // TODO: If we don't have to do an emergency stack slot spill, converting
      // to use the VGPR offset is fewer instructions.
      if (!ScratchOffsetReg)
        ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
    if (!IsFlat && !UseVGPROffset)
      Offset *= ST.getWavefrontSize();

    if (!UseVGPROffset && !SOffset)
      report_fatal_error("could not scavenge SGPR to spill in entry function");

    if (UseVGPROffset) {
      // We are using a VGPR offset
      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
    } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
    } else {
      assert(Offset != 0);
      auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
          .addReg(ScratchOffsetReg)
          .addImm(Offset);
      Add->getOperand(3).setIsDead(); // Mark SCC as dead.
    }

    // The whole offset now lives in a register; only RegOffset remains as the
    // immediate of each access.
    Offset = 0;
  }

  if (IsFlat && SOffset == AMDGPU::NoRegister) {
    assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
           && "Unexpected vaddr for flat scratch with a FI operand");

    if (UseVGPROffset) {
      LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
    } else {
      assert(ST.hasFlatScratchSTMode());
      LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
    }

    Desc = &TII->get(LoadStoreOp);
  }

  // One iteration per piece; RegOffset is the byte offset of the piece within
  // the register.
  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
       ++i, RegOffset += EltSize) {
    // The trailing remainder piece uses a smaller access.
    if (i == NumSubRegs) {
      EltSize = RemSize;
      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    }
    Desc = &TII->get(LoadStoreOp);

    if (!IsFlat && UseVGPROffset) {
      int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
                                   : getOffenMUBUFLoad(LoadStoreOp);
      Desc = &TII->get(NewLoadStoreOp);
    }

    if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
      // If we are spilling an AGPR beyond the range of the memory instruction
      // offset and need to use a VGPR offset, we ideally have at least 2
      // scratch VGPRs. If we don't have a second free VGPR without spilling,
      // recycle the VGPR used for the offset which requires resetting after
      // each subregister.

      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
    }

    unsigned NumRegs = EltSize / 4;
    Register SubReg = e == 1
            ? ValueReg
            : Register(getSubReg(ValueReg,
                                 getSubRegFromChannel(RegOffset / 4, NumRegs)));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    const bool IsLastSubReg = i + 1 == e;
    if (IsLastSubReg) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
    bool NeedSuperRegDef = e > 1 && IsStore && i == 0;
    bool NeedSuperRegImpOperand = e > 1;

    // Remaining element size to spill into memory after some parts of it
    // spilled into either AGPRs or VGPRs.
    unsigned RemEltSize = EltSize;

    // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
    // starting from the last lane. In case if a register cannot be completely
    // spilled into another register that will ensure its alignment does not
    // change. For targets with VGPR alignment requirement this is important
    // in case of flat scratch usage as we might get a scratch_load or
    // scratch_store of an unaligned register otherwise.
    for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
             LaneE = RegOffset / 4;
         Lane >= LaneE; --Lane) {
      bool IsSubReg = e > 1 || EltSize > 4;
      Register Sub = IsSubReg
             ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
             : ValueReg;
      auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
      // No register recorded for this lane: the rest goes through memory.
      if (!MIB.getInstr())
        break;
      if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && !i)) {
        MIB.addReg(ValueReg, RegState::ImplicitDefine);
        NeedSuperRegDef = false;
      }
      if (IsSubReg || NeedSuperRegImpOperand) {
        NeedSuperRegImpOperand = true;
        unsigned State = SrcDstRegState;
        // Only the final lane of this piece may carry the kill flag.
        if (Lane != LaneE)
          State &= ~RegState::Kill;
        MIB.addReg(ValueReg, RegState::Implicit | State);
      }
      RemEltSize -= 4;
    }

    if (!RemEltSize) // Fully spilled into AGPRs.
      continue;

    if (RemEltSize != EltSize) { // Partially spilled to AGPRs
      assert(IsFlat && EltSize > 4);

      unsigned NumRegs = RemEltSize / 4;
      SubReg = Register(getSubReg(ValueReg,
                        getSubRegFromChannel(RegOffset / 4, NumRegs)));
      unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
      Desc = &TII->get(Opc);
    }

    // Remember the real destination; SubReg may be redirected to a temp VGPR.
    unsigned FinalReg = SubReg;

    if (IsAGPR) {
      assert(EltSize == 4);

      if (!TmpIntermediateVGPR) {
        assert(MF->getRegInfo().isReserved(AMDGPU::VGPR32));
        TmpIntermediateVGPR = AMDGPU::VGPR32;
      }
      if (IsStore) {
        // Stores read the AGPR into the temp VGPR first.
        auto AccRead = BuildMI(MBB, MI, DL,
                               TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
                               TmpIntermediateVGPR)
                           .addReg(SubReg, getKillRegState(IsKill));
        if (NeedSuperRegDef)
          AccRead.addReg(ValueReg, RegState::ImplicitDefine);
        AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
      }
      SubReg = TmpIntermediateVGPR;
    } else if (UseVGPROffset) {
      // FIXME: change to scavengeRegisterBackwards()
      if (!TmpOffsetVGPR) {
        TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
        RS->setRegUsed(TmpOffsetVGPR);
      }
    }

    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
    MachineMemOperand *NewMMO =
        MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
                                 commonAlignment(Alignment, RegOffset));

    // Build the memory access. Operand order: data, [voffset], [rsrc],
    // [soffset or 0], imm offset, cpol, [tfe, swz].
    auto MIB =
        BuildMI(MBB, MI, DL, *Desc)
            .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));

    if (UseVGPROffset) {
      // For an AGPR spill, we reuse the same temp VGPR for the offset and the
      // intermediate accvgpr_write.
      MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
    }

    if (!IsFlat)
      MIB.addReg(FuncInfo->getScratchRSrcReg());

    if (SOffset == AMDGPU::NoRegister) {
      if (!IsFlat) {
        if (UseVGPROffset && ScratchOffsetReg) {
          assert(!FuncInfo->isEntryFunction());
          MIB.addReg(ScratchOffsetReg);
        } else {
          assert(FuncInfo->isEntryFunction());
          MIB.addImm(0);
        }
      }
    } else {
      MIB.addReg(SOffset, SOffsetRegState);
    }
    MIB.addImm(Offset + RegOffset)
       .addImm(0); // cpol
    if (!IsFlat)
      MIB.addImm(0)  // tfe
         .addImm(0); // swz
    MIB.addMemOperand(NewMMO);

    if (!IsAGPR && NeedSuperRegDef)
      MIB.addReg(ValueReg, RegState::ImplicitDefine);

    // AGPR reload: move the loaded value from the temp VGPR into the AGPR.
    if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
      MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
                    FinalReg)
                .addReg(TmpIntermediateVGPR, RegState::Kill);
      MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    }

    if (NeedSuperRegImpOperand)
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
  }

  if (ScratchOffsetRegDelta != 0) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
        .addReg(SOffset)
        .addImm(-ScratchOffsetRegDelta);
  }
}

/// Loads (\p IsLoad) or stores SB.TmpVGPR from/to stack object \p Index at
/// byte offset \p Offset * SB.EltSize, by calling buildSpillLoadStore() with
/// a single-dword scratch or buffer opcode chosen by
/// ST.enableFlatScratch().
///
/// The base register is getBaseRegister() for fixed objects when the
/// function has a base pointer, otherwise getFrameRegister(). \p IsKill is
/// forwarded only for stores. A store also bumps the spilled-VGPR count by 1.
void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
                                             int Offset, bool IsLoad,
                                             bool IsKill) const {
  // Load/store VGPR
  MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
  assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);

  Register FrameReg =
      FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
          ? getBaseRegister()
          : getFrameRegister(SB.MF);

  Align Alignment = FrameInfo.getObjectAlign(Index);
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
  MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
      PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
      SB.EltSize, Alignment);

  if (IsLoad) {
    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                          : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
    buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
                        FrameReg, Offset * SB.EltSize, MMO, SB.RS);
  } else {
    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
    buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
                        FrameReg, Offset * SB.EltSize, MMO, SB.RS);
    // This only ever adds one VGPR spill
    SB.MFI.addToSpilledVGPRs(1);
  }
}

/// Expands the SGPR spill at \p MI for frame index \p Index.
///
/// If VGPR lanes are recorded for \p Index (getSGPRToVGPRSpills()), each
/// 32-bit piece of the register is written into its recorded lane with
/// V_WRITELANE_B32. Otherwise the pieces are packed into SB.TmpVGPR, up to
/// PVD.PerVGPR per round, and each round is written out with
/// SB.readWriteTmpVGPR().
///
/// Returns false when \p OnlyToVGPR is set and no lanes are recorded.
/// Otherwise \p MI is erased, the spilled-SGPR count is increased, and true
/// is returned. When \p LIS is non-null its instruction maps are kept in
/// sync with the new instructions.
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS,
                               LiveIntervals *LIS,
                               bool OnlyToVGPR) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
      SB.MFI.getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
                         SB.SuperReg != SB.MFI.getFrameOffsetReg()));

  if (SpillToVGPR) {

    assert(SB.NumSubRegs == VGPRSpills.size() &&
           "Num of VGPR lanes should be equal to num of SGPRs spilled");

    for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];

      // Only the last piece may kill the source register.
      bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1;

      // Mark the "old value of vgpr" input undef only if this is the first sgpr
      // spill to this specific vgpr in the first basic block.
      auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
                         SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
                     .addReg(SubReg, getKillRegState(UseKill))
                     .addImm(Spill.Lane)
                     .addReg(Spill.VGPR);
      if (LIS) {
        // The first new instruction takes over MI's slot in the maps.
        if (i == 0)
          LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
        else
          LIS->InsertMachineInstrInMaps(*MIB);
      }

      if (i == 0 && SB.NumSubRegs > 1) {
        // We may be spilling a super-register which is only partially defined,
        // and need to ensure later spills think the value is defined.
        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
      }

      if (SB.NumSubRegs > 1)
        MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    }
  } else {
    SB.prepare();

    // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
    unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);

    // Per VGPR helper data
    auto PVD = SB.getPerVGPRData();

    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
      // The first write of each round reads TmpVGPR as undef.
      unsigned TmpVGPRFlags = RegState::Undef;

      // Write sub registers into the VGPR
      for (unsigned i = Offset * PVD.PerVGPR,
                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            SB.NumSubRegs == 1
                ? SB.SuperReg
                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

        MachineInstrBuilder WriteLane =
            BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
                    SB.TmpVGPR)
                .addReg(SubReg, SubKillState)
                .addImm(i % PVD.PerVGPR)
                .addReg(SB.TmpVGPR, TmpVGPRFlags);
        TmpVGPRFlags = 0;

        if (LIS) {
          if (i == 0)
            LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane);
          else
            LIS->InsertMachineInstrInMaps(*WriteLane);
        }

        // There could be undef components of a spilled super register.
        // TODO: Can we detect this and skip the spill?
        if (SB.NumSubRegs > 1) {
          // The last implicit use of the SB.SuperReg carries the "Kill" flag.
          unsigned SuperKillState = 0;
          if (i + 1 == SB.NumSubRegs)
            SuperKillState |= getKillRegState(SB.IsKill);
          WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
        }
      }

      // Write out VGPR
      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
    }

    SB.restore();
  }

  MI->eraseFromParent();
  SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);

  if (LIS)
    LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);

  return true;
}

/// Expands the SGPR reload at \p MI for frame index \p Index; the mirror of
/// spillSGPR().
///
/// If VGPR lanes are recorded for \p Index, each 32-bit piece is read back
/// from its recorded lane with V_READLANE_B32. Otherwise SB.TmpVGPR is
/// loaded with SB.readWriteTmpVGPR() one round at a time and the pieces are
/// read out of it.
///
/// Returns false when \p OnlyToVGPR is set and no lanes are recorded.
/// Otherwise \p MI is erased and true is returned. When \p LIS is non-null
/// its instruction maps are kept in sync with the new instructions.
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS,
                                 LiveIntervals *LIS,
                                 bool OnlyToVGPR) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
      SB.MFI.getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  if (SpillToVGPR) {
    for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
      auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
                         SubReg)
                     .addReg(Spill.VGPR)
                     .addImm(Spill.Lane);
      if (SB.NumSubRegs > 1 && i == 0)
        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
      if (LIS) {
        // Here the last new instruction takes over MI's slot in the maps.
        if (i == e - 1)
          LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
        else
          LIS->InsertMachineInstrInMaps(*MIB);
      }

    }
  } else {
    SB.prepare();

    // Per VGPR helper data
    auto PVD = SB.getPerVGPRData();

    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
      // Load in VGPR data
      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);

      // Unpack lanes
      for (unsigned i = Offset * PVD.PerVGPR,
                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            SB.NumSubRegs == 1
                ? SB.SuperReg
                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

        bool LastSubReg = (i + 1 == e);
        // NOTE(review): the lane read here is `i`, while spillSGPR() writes
        // lane `i % PVD.PerVGPR`. The two agree only while Offset == 0;
        // confirm whether PVD.NumVGPRs can exceed 1.
        auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
                           SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
                       .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
                       .addImm(i);
        if (SB.NumSubRegs > 1 && i == 0)
          MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
        if (LIS) {
          if (i == e - 1)
            LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
          else
            LIS->InsertMachineInstrInMaps(*MIB);
        }
      }
    }

    SB.restore();
  }

  MI->eraseFromParent();

  if (LIS)
    LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);

  return true;
}

bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
                                        MachineBasicBlock &RestoreMBB,
                                        Register SGPR, RegScavenger *RS) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
                      RS);
  SB.prepare();
  // Generate the spill of SGPR to
SB.TmpVGPR. 1876 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1877 auto PVD = SB.getPerVGPRData(); 1878 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1879 unsigned TmpVGPRFlags = RegState::Undef; 1880 // Write sub registers into the VGPR 1881 for (unsigned i = Offset * PVD.PerVGPR, 1882 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1883 i < e; ++i) { 1884 Register SubReg = 1885 SB.NumSubRegs == 1 1886 ? SB.SuperReg 1887 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1888 1889 MachineInstrBuilder WriteLane = 1890 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1891 SB.TmpVGPR) 1892 .addReg(SubReg, SubKillState) 1893 .addImm(i % PVD.PerVGPR) 1894 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1895 TmpVGPRFlags = 0; 1896 // There could be undef components of a spilled super register. 1897 // TODO: Can we detect this and skip the spill? 1898 if (SB.NumSubRegs > 1) { 1899 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1900 unsigned SuperKillState = 0; 1901 if (i + 1 == SB.NumSubRegs) 1902 SuperKillState |= getKillRegState(SB.IsKill); 1903 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1904 } 1905 } 1906 // Don't need to write VGPR out. 1907 } 1908 1909 // Restore clobbered registers in the specified restore block. 1910 MI = RestoreMBB.end(); 1911 SB.setMI(&RestoreMBB, MI); 1912 // Generate the restore of SGPR from SB.TmpVGPR. 1913 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1914 // Don't need to load VGPR in. 1915 // Unpack lanes 1916 for (unsigned i = Offset * PVD.PerVGPR, 1917 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1918 i < e; ++i) { 1919 Register SubReg = 1920 SB.NumSubRegs == 1 1921 ? 
SB.SuperReg 1922 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1923 bool LastSubReg = (i + 1 == e); 1924 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1925 SubReg) 1926 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1927 .addImm(i); 1928 if (SB.NumSubRegs > 1 && i == 0) 1929 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1930 } 1931 } 1932 SB.restore(); 1933 1934 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1935 return false; 1936 } 1937 1938 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 1939 /// a VGPR and the stack slot can be safely eliminated when all other users are 1940 /// handled. 1941 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 1942 MachineBasicBlock::iterator MI, 1943 int FI, 1944 RegScavenger *RS, 1945 LiveIntervals *LIS) const { 1946 switch (MI->getOpcode()) { 1947 case AMDGPU::SI_SPILL_S1024_SAVE: 1948 case AMDGPU::SI_SPILL_S512_SAVE: 1949 case AMDGPU::SI_SPILL_S256_SAVE: 1950 case AMDGPU::SI_SPILL_S224_SAVE: 1951 case AMDGPU::SI_SPILL_S192_SAVE: 1952 case AMDGPU::SI_SPILL_S160_SAVE: 1953 case AMDGPU::SI_SPILL_S128_SAVE: 1954 case AMDGPU::SI_SPILL_S96_SAVE: 1955 case AMDGPU::SI_SPILL_S64_SAVE: 1956 case AMDGPU::SI_SPILL_S32_SAVE: 1957 return spillSGPR(MI, FI, RS, LIS, true); 1958 case AMDGPU::SI_SPILL_S1024_RESTORE: 1959 case AMDGPU::SI_SPILL_S512_RESTORE: 1960 case AMDGPU::SI_SPILL_S256_RESTORE: 1961 case AMDGPU::SI_SPILL_S224_RESTORE: 1962 case AMDGPU::SI_SPILL_S192_RESTORE: 1963 case AMDGPU::SI_SPILL_S160_RESTORE: 1964 case AMDGPU::SI_SPILL_S128_RESTORE: 1965 case AMDGPU::SI_SPILL_S96_RESTORE: 1966 case AMDGPU::SI_SPILL_S64_RESTORE: 1967 case AMDGPU::SI_SPILL_S32_RESTORE: 1968 return restoreSGPR(MI, FI, RS, LIS, true); 1969 default: 1970 llvm_unreachable("not an SGPR spill instruction"); 1971 } 1972 } 1973 1974 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 1975 int SPAdj, unsigned FIOperandNum, 1976 RegScavenger *RS) const { 1977 
MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");

  // FIOp and Index both refer to operand FIOperandNum of MI.
  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  // Fixed objects are addressed off the base register when the function has
  // a base pointer; everything else uses the frame register.
  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
                          ? getBaseRegister()
                          : getFrameRegister(*MF);

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S1024_SAVE:
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S224_SAVE:
    case AMDGPU::SI_SPILL_S192_SAVE:
    case AMDGPU::SI_SPILL_S160_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S96_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      spillSGPR(MI, Index, RS);
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S1024_RESTORE:
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S224_RESTORE:
    case AMDGPU::SI_SPILL_S192_RESTORE:
    case AMDGPU::SI_SPILL_S160_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S96_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      restoreSGPR(MI, Index, RS);
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V1024_SAVE:
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V224_SAVE:
    case AMDGPU::SI_SPILL_V192_SAVE:
    case AMDGPU::SI_SPILL_V160_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE:
    case AMDGPU::SI_SPILL_A1024_SAVE:
    case AMDGPU::SI_SPILL_A512_SAVE:
    case AMDGPU::SI_SPILL_A256_SAVE:
    case AMDGPU::SI_SPILL_A224_SAVE:
    case AMDGPU::SI_SPILL_A192_SAVE:
    case AMDGPU::SI_SPILL_A160_SAVE:
    case AMDGPU::SI_SPILL_A128_SAVE:
    case AMDGPU::SI_SPILL_A96_SAVE:
    case AMDGPU::SI_SPILL_A64_SAVE:
    case AMDGPU::SI_SPILL_A32_SAVE:
    case AMDGPU::SI_SPILL_AV1024_SAVE:
    case AMDGPU::SI_SPILL_AV512_SAVE:
    case AMDGPU::SI_SPILL_AV256_SAVE:
    case AMDGPU::SI_SPILL_AV224_SAVE:
    case AMDGPU::SI_SPILL_AV192_SAVE:
    case AMDGPU::SI_SPILL_AV160_SAVE:
    case AMDGPU::SI_SPILL_AV128_SAVE:
    case AMDGPU::SI_SPILL_AV96_SAVE:
    case AMDGPU::SI_SPILL_AV64_SAVE:
    case AMDGPU::SI_SPILL_AV32_SAVE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
             MFI->getStackPtrOffsetReg());

      unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                            : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
      // NOTE(review): shadows the function-level MBB; both are
      // MI->getParent().
      auto *MBB = MI->getParent();
      buildSpillLoadStore(
          *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
          *MI->memoperands_begin(), RS);
      MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
      MI->eraseFromParent();
      break;
    }
    // VGPR/AGPR/AV register restore
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V96_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V160_RESTORE:
    case AMDGPU::SI_SPILL_V192_RESTORE:
    case AMDGPU::SI_SPILL_V224_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE:
    case AMDGPU::SI_SPILL_V1024_RESTORE:
    case AMDGPU::SI_SPILL_A32_RESTORE:
    case AMDGPU::SI_SPILL_A64_RESTORE:
    case AMDGPU::SI_SPILL_A96_RESTORE:
    case AMDGPU::SI_SPILL_A128_RESTORE:
    case AMDGPU::SI_SPILL_A160_RESTORE:
    case AMDGPU::SI_SPILL_A192_RESTORE:
    case AMDGPU::SI_SPILL_A224_RESTORE:
    case AMDGPU::SI_SPILL_A256_RESTORE:
    case AMDGPU::SI_SPILL_A512_RESTORE:
    case AMDGPU::SI_SPILL_A1024_RESTORE:
    case AMDGPU::SI_SPILL_AV32_RESTORE:
    case AMDGPU::SI_SPILL_AV64_RESTORE:
    case AMDGPU::SI_SPILL_AV96_RESTORE:
    case AMDGPU::SI_SPILL_AV128_RESTORE:
    case AMDGPU::SI_SPILL_AV160_RESTORE:
    case AMDGPU::SI_SPILL_AV192_RESTORE:
    case AMDGPU::SI_SPILL_AV224_RESTORE:
    case AMDGPU::SI_SPILL_AV256_RESTORE:
    case AMDGPU::SI_SPILL_AV512_RESTORE:
    case AMDGPU::SI_SPILL_AV1024_RESTORE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
             MFI->getStackPtrOffsetReg());

      unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                            : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
      // NOTE(review): shadows the function-level MBB; both are
      // MI->getParent().
      auto *MBB = MI->getParent();
      buildSpillLoadStore(
          *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
          *MI->memoperands_begin(), RS);
      MI->eraseFromParent();
      break;
    }

    default: {
      // Other access to frame index
      // NOTE(review): shadows the function-level DL copy taken from the same
      // instruction.
      const DebugLoc &DL = MI->getDebugLoc();

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      if (ST.enableFlatScratch()) {
        if (TII->isFLATScratch(*MI)) {
          assert((int16_t)FIOperandNum ==
                 AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::saddr));

          // The offset is always swizzled, just replace it
          if (FrameReg)
            FIOp.ChangeToRegister(FrameReg, false);

          if (!Offset)
            return;

          // Try to fold the object offset into the instruction's immediate
          // offset field.
          MachineOperand *OffsetOp =
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
          int64_t NewOffset = Offset + OffsetOp->getImm();
          if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                     SIInstrFlags::FlatScratch)) {
            OffsetOp->setImm(NewOffset);
            if (FrameReg)
              return;
            // Folded completely; what remains is an address operand of 0.
            Offset = 0;
          }

          assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
                 "Unexpected vaddr for flat scratch with a FI operand");

          // On GFX10 we have ST mode to use no registers for an address.
          // Otherwise we need to materialize 0 into an SGPR.
          if (!Offset && ST.hasFlatScratchSTMode()) {
            unsigned Opc = MI->getOpcode();
            unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
            MI->RemoveOperand(
                AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
            MI->setDesc(TII->get(NewOpc));
            return;
          }
        }

        if (!FrameReg) {
          FIOp.ChangeToImmediate(Offset);
          if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
            return;
        }

        // We need to use register here. Check if we can use an SGPR or need
        // a VGPR.
        // M0 is only a stand-in SGPR for the legality query; the operand's
        // register is overwritten on every path below.
        FIOp.ChangeToRegister(AMDGPU::M0, false);
        bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);

        if (!Offset && FrameReg && UseSGPR) {
          FIOp.setReg(FrameReg);
          return;
        }

        const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
                                                : &AMDGPU::VGPR_32RegClass;

        Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
        FIOp.setReg(TmpReg);
        FIOp.setIsKill(true);

        // Only one of FrameReg / Offset contributes: a single move suffices.
        if ((!FrameReg || !Offset) && TmpReg) {
          unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
          auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
          if (FrameReg)
            MIB.addReg(FrameReg);
          else
            MIB.addImm(Offset);

          return;
        }

        // FrameReg + Offset has to be computed in an SGPR first.
        Register TmpSReg =
            UseSGPR ? TmpReg
                    : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
                                           !UseSGPR);

        // TODO: for flat scratch another attempt can be made with a VGPR index
        //       if no SGPRs can be scavenged.
        if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
          report_fatal_error("Cannot scavenge register in FI elimination!");

        if (!TmpSReg) {
          // Use frame register and restore it after.
          TmpSReg = FrameReg;
          FIOp.setReg(FrameReg);
          FIOp.setIsKill(false);
        }

        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
            .addReg(FrameReg)
            .addImm(Offset);

        if (!UseSGPR)
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
              .addReg(TmpSReg, RegState::Kill);

        if (TmpSReg == FrameReg) {
          // Undo frame register modification.
          // Inserted after MI (std::next), unlike the instructions above.
          BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
                  FrameReg)
              .addReg(FrameReg)
              .addImm(-Offset);
        }

        return;
      }

      bool IsMUBUF = TII->isMUBUF(*MI);

      if (!IsMUBUF && !MFI->isEntryFunction()) {
        // Convert to a swizzled stack address by scaling by the wave size.
        //
        // In an entry function/kernel the offset is already swizzled.

        // A V_MOV_B32 of the frame index can be replaced outright: compute
        // straight into its destination and erase it afterwards.
        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
        Register ResultReg =
            IsCopy ? MI->getOperand(0).getReg()
                   : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        if (Offset == 0) {
          // XXX - This never happens because of emergency scavenging slot at 0?
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
            .addImm(ST.getWavefrontSizeLog2())
            .addReg(FrameReg);
        } else {
          if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
            // Reuse ResultReg in intermediate step.
            Register ScaledReg = ResultReg;

            // The shift is inserted before the add created above (*MIB).
            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
                    ScaledReg)
              .addImm(ST.getWavefrontSizeLog2())
              .addReg(FrameReg);

            const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;

            // TODO: Fold if use instruction is another add of a constant.
            if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
              // FIXME: This can fail
              MIB.addImm(Offset);
              MIB.addReg(ScaledReg, RegState::Kill);
              if (!IsVOP2)
                MIB.addImm(0); // clamp bit
            } else {
              assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
                     "Need to reuse carry out register");

              // Use scavenged unused carry out as offset register.
              // When not wave32, only sub0 of the carry-out register is
              // used.
              Register ConstOffsetReg;
              if (!isWave32)
                ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
              else
                ConstOffsetReg = MIB.getReg(1);

              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
                .addImm(Offset);
              MIB.addReg(ConstOffsetReg, RegState::Kill);
              MIB.addReg(ScaledReg, RegState::Kill);
              MIB.addImm(0); // clamp bit
            }
          } else {
            // We have to produce a carry out, and there isn't a free SGPR pair
            // for it. We can keep the whole computation on the SALU to avoid
            // clobbering an additional register at the cost of an extra mov.

            // We may have 1 free scratch SGPR even though a carry out is
            // unavailable. Only one additional mov is needed.
            Register TmpScaledReg =
              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
            Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;

            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
              .addReg(FrameReg)
              .addImm(ST.getWavefrontSizeLog2());
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
                .addReg(ScaledReg, RegState::Kill)
                .addImm(Offset);
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
              .addReg(ScaledReg, RegState::Kill);

            // If there were truly no free SGPRs, we need to undo everything.
            // In this case ScaledReg is FrameReg itself, so the add and the
            // shift are reversed in the opposite order.
            if (!TmpScaledReg.isValid()) {
              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
                  .addReg(ScaledReg, RegState::Kill)
                  .addImm(-Offset);
              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
                .addReg(FrameReg)
                .addImm(ST.getWavefrontSizeLog2());
            }
          }
        }

        // Don't introduce an extra copy if we're just materializing in a mov.
        if (IsCopy)
          MI->eraseFromParent();
        else
          FIOp.ChangeToRegister(ResultReg, false, false, true);
        return;
      }

      if (IsMUBUF) {
        // Disable offen so we don't need a 0 vgpr base.
        assert(static_cast<int>(FIOperandNum) ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::vaddr));

        auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
        assert((SOffset.isImm() && SOffset.getImm() == 0));

        if (FrameReg != AMDGPU::NoRegister)
          SOffset.ChangeToRegister(FrameReg, false);

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        int64_t OldImm
          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
        int64_t NewOffset = OldImm + Offset;

        // If the combined offset is a legal MUBUF immediate and the
        // replacement was built, the original instruction is dropped.
        if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
            buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
          MI->eraseFromParent();
          return;
        }
      }

      // If the offset is simply too big, don't convert to a scratch wave offset
      // relative index.

      // Fallback: use the object offset as an immediate, or move it into a
      // scavenged VGPR when the immediate is not legal for this operand.
      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
        Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}

/// Returns the assembly name of \p Reg by forwarding to
/// AMDGPUInstPrinter::getRegisterName().
StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
  return AMDGPUInstPrinter::getRegisterName(Reg);
}

/// Returns the first of the VReg_64 ... VReg_1024 classes whose width is at
/// least \p BitWidth, or nullptr if \p BitWidth exceeds 1024.
static const TargetRegisterClass *
getAnyVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::VReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::VReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::VReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::VReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::VReg_192RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::VReg_224RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::VReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::VReg_512RegClass;
  if (BitWidth <= 1024)
    return
&AMDGPU::VReg_1024RegClass;

  return nullptr;
}

/// Returns the first of the VReg_64_Align2 ... VReg_1024_Align2 classes whose
/// width is at least \p BitWidth, or nullptr if \p BitWidth exceeds 1024.
static const TargetRegisterClass *
getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::VReg_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::VReg_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::VReg_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::VReg_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::VReg_192_Align2RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::VReg_224_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::VReg_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::VReg_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::VReg_1024_Align2RegClass;

  return nullptr;
}

/// Returns a VGPR class able to hold \p BitWidth bits.
///
/// Exactly 1 bit maps to VReg_1, up to 16 bits to VGPR_LO16 and up to 32 bits
/// to VGPR_32. Wider values use the `_Align2` tuple classes when the
/// subtarget reports needsAlignedVGPRs(), otherwise the plain tuple classes.
/// The result is nullptr for widths above 1024 bits.
const TargetRegisterClass *
SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth == 1)
    return &AMDGPU::VReg_1RegClass;
  if (BitWidth <= 16)
    return &AMDGPU::VGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::VGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
                                : getAnyVGPRClassForBitWidth(BitWidth);
}

/// Returns the first of the AReg_64 ... AReg_1024 classes whose width is at
/// least \p BitWidth, or nullptr if \p BitWidth exceeds 1024.
static const TargetRegisterClass *
getAnyAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AReg_192RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::AReg_224RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AReg_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AReg_1024RegClass;

  return nullptr;
}

/// Returns the first of the AReg_64_Align2 ... AReg_1024_Align2 classes whose
/// width is at least \p BitWidth, or nullptr if \p BitWidth exceeds 1024.
static const TargetRegisterClass *
getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AReg_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AReg_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AReg_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AReg_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AReg_192_Align2RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::AReg_224_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AReg_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AReg_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AReg_1024_Align2RegClass;

  return nullptr;
}

/// Returns an AGPR class able to hold \p BitWidth bits.
///
/// Up to 16 bits maps to AGPR_LO16 and up to 32 bits to AGPR_32. Wider values
/// use the `_Align2` tuple classes when the subtarget reports
/// needsAlignedVGPRs(), otherwise the plain tuple classes. The result is
/// nullptr for widths above 1024 bits.
const TargetRegisterClass *
SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth <= 16)
    return &AMDGPU::AGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::AGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
                                : getAnyAGPRClassForBitWidth(BitWidth);
}

/// Returns the first of the AV_64 ... AV_1024 classes whose width is at least
/// \p BitWidth, or nullptr if \p BitWidth exceeds 1024.
static const TargetRegisterClass *
getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AV_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AV_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AV_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AV_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AV_192RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::AV_224RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AV_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AV_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AV_1024RegClass;

  return nullptr;
}

/// Returns the first of the AV_64_Align2 ... AV_1024_Align2 classes whose
/// width is at least \p BitWidth, or nullptr if \p BitWidth exceeds 1024.
static const TargetRegisterClass *
getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AV_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AV_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AV_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AV_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AV_192_Align2RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::AV_224_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AV_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AV_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AV_1024_Align2RegClass;

  return nullptr;
}

/// Returns an AV class able to hold \p BitWidth bits.
///
/// Up to 16 bits maps to VGPR_LO16 (there is no AV 16-bit class in this
/// mapping) and up to 32 bits to AV_32. Wider values use the `_Align2` tuple
/// classes when the subtarget reports needsAlignedVGPRs(), otherwise the
/// plain tuple classes. The result is nullptr for widths above 1024 bits.
const TargetRegisterClass *
SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth <= 16)
    return &AMDGPU::VGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::AV_32RegClass;
  return ST.needsAlignedVGPRs()
             ? getAlignedVectorSuperClassForBitWidth(BitWidth)
             : getAnyVectorSuperClassForBitWidth(BitWidth);
}

/// Returns an SGPR class able to hold \p BitWidth bits: SGPR_LO16 up to 16
/// bits, SReg_32 up to 32, SReg_64 up to 64, then SGPR_96 ... SGPR_1024.
/// The result is nullptr for widths above 1024 bits.
const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 16)
    return &AMDGPU::SGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::SReg_32RegClass;
  if (BitWidth <= 64)
    return &AMDGPU::SReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::SGPR_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::SGPR_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::SGPR_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::SGPR_192RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::SGPR_224RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::SGPR_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::SGPR_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::SGPR_1024RegClass;

  return nullptr;
}

// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
2574 const TargetRegisterClass * 2575 SIRegisterInfo::getPhysRegClass(MCRegister Reg) const { 2576 static const TargetRegisterClass *const BaseClasses[] = { 2577 &AMDGPU::VGPR_LO16RegClass, 2578 &AMDGPU::VGPR_HI16RegClass, 2579 &AMDGPU::SReg_LO16RegClass, 2580 &AMDGPU::AGPR_LO16RegClass, 2581 &AMDGPU::VGPR_32RegClass, 2582 &AMDGPU::SReg_32RegClass, 2583 &AMDGPU::AGPR_32RegClass, 2584 &AMDGPU::AGPR_32RegClass, 2585 &AMDGPU::VReg_64_Align2RegClass, 2586 &AMDGPU::VReg_64RegClass, 2587 &AMDGPU::SReg_64RegClass, 2588 &AMDGPU::AReg_64_Align2RegClass, 2589 &AMDGPU::AReg_64RegClass, 2590 &AMDGPU::VReg_96_Align2RegClass, 2591 &AMDGPU::VReg_96RegClass, 2592 &AMDGPU::SReg_96RegClass, 2593 &AMDGPU::AReg_96_Align2RegClass, 2594 &AMDGPU::AReg_96RegClass, 2595 &AMDGPU::VReg_128_Align2RegClass, 2596 &AMDGPU::VReg_128RegClass, 2597 &AMDGPU::SReg_128RegClass, 2598 &AMDGPU::AReg_128_Align2RegClass, 2599 &AMDGPU::AReg_128RegClass, 2600 &AMDGPU::VReg_160_Align2RegClass, 2601 &AMDGPU::VReg_160RegClass, 2602 &AMDGPU::SReg_160RegClass, 2603 &AMDGPU::AReg_160_Align2RegClass, 2604 &AMDGPU::AReg_160RegClass, 2605 &AMDGPU::VReg_192_Align2RegClass, 2606 &AMDGPU::VReg_192RegClass, 2607 &AMDGPU::SReg_192RegClass, 2608 &AMDGPU::AReg_192_Align2RegClass, 2609 &AMDGPU::AReg_192RegClass, 2610 &AMDGPU::VReg_224_Align2RegClass, 2611 &AMDGPU::VReg_224RegClass, 2612 &AMDGPU::SReg_224RegClass, 2613 &AMDGPU::AReg_224_Align2RegClass, 2614 &AMDGPU::AReg_224RegClass, 2615 &AMDGPU::VReg_256_Align2RegClass, 2616 &AMDGPU::VReg_256RegClass, 2617 &AMDGPU::SReg_256RegClass, 2618 &AMDGPU::AReg_256_Align2RegClass, 2619 &AMDGPU::AReg_256RegClass, 2620 &AMDGPU::VReg_512_Align2RegClass, 2621 &AMDGPU::VReg_512RegClass, 2622 &AMDGPU::SReg_512RegClass, 2623 &AMDGPU::AReg_512_Align2RegClass, 2624 &AMDGPU::AReg_512RegClass, 2625 &AMDGPU::SReg_1024RegClass, 2626 &AMDGPU::VReg_1024_Align2RegClass, 2627 &AMDGPU::VReg_1024RegClass, 2628 &AMDGPU::AReg_1024_Align2RegClass, 2629 &AMDGPU::AReg_1024RegClass, 2630 
&AMDGPU::SCC_CLASSRegClass, 2631 &AMDGPU::Pseudo_SReg_32RegClass, 2632 &AMDGPU::Pseudo_SReg_128RegClass, 2633 }; 2634 2635 for (const TargetRegisterClass *BaseClass : BaseClasses) { 2636 if (BaseClass->contains(Reg)) { 2637 return BaseClass; 2638 } 2639 } 2640 return nullptr; 2641 } 2642 2643 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, 2644 Register Reg) const { 2645 const TargetRegisterClass *RC; 2646 if (Reg.isVirtual()) 2647 RC = MRI.getRegClass(Reg); 2648 else 2649 RC = getPhysRegClass(Reg); 2650 return isSGPRClass(RC); 2651 } 2652 2653 const TargetRegisterClass * 2654 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { 2655 unsigned Size = getRegSizeInBits(*SRC); 2656 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 2657 assert(VRC && "Invalid register class size"); 2658 return VRC; 2659 } 2660 2661 const TargetRegisterClass * 2662 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { 2663 unsigned Size = getRegSizeInBits(*SRC); 2664 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 2665 assert(ARC && "Invalid register class size"); 2666 return ARC; 2667 } 2668 2669 const TargetRegisterClass * 2670 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { 2671 unsigned Size = getRegSizeInBits(*VRC); 2672 if (Size == 32) 2673 return &AMDGPU::SGPR_32RegClass; 2674 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); 2675 assert(SRC && "Invalid register class size"); 2676 return SRC; 2677 } 2678 2679 const TargetRegisterClass *SIRegisterInfo::getSubRegClass( 2680 const TargetRegisterClass *RC, unsigned SubIdx) const { 2681 if (SubIdx == AMDGPU::NoSubRegister) 2682 return RC; 2683 2684 // We can assume that each lane corresponds to one 32-bit register. 
2685 unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32; 2686 if (isAGPRClass(RC)) { 2687 RC = getAGPRClassForBitWidth(Size); 2688 } else if (isVGPRClass(RC)) { 2689 RC = getVGPRClassForBitWidth(Size); 2690 } else if (isVectorSuperClass(RC)) { 2691 RC = getVectorSuperClassForBitWidth(Size); 2692 } else { 2693 RC = getSGPRClassForBitWidth(Size); 2694 } 2695 assert(RC && "Invalid sub-register class size"); 2696 return RC; 2697 } 2698 2699 const TargetRegisterClass * 2700 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, 2701 const TargetRegisterClass *SubRC, 2702 unsigned SubIdx) const { 2703 // Ensure this subregister index is aligned in the super register. 2704 const TargetRegisterClass *MatchRC = 2705 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); 2706 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr; 2707 } 2708 2709 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { 2710 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && 2711 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) 2712 return !ST.hasMFMAInlineLiteralBug(); 2713 2714 return OpType >= AMDGPU::OPERAND_SRC_FIRST && 2715 OpType <= AMDGPU::OPERAND_SRC_LAST; 2716 } 2717 2718 bool SIRegisterInfo::shouldRewriteCopySrc( 2719 const TargetRegisterClass *DefRC, 2720 unsigned DefSubReg, 2721 const TargetRegisterClass *SrcRC, 2722 unsigned SrcSubReg) const { 2723 // We want to prefer the smallest register class possible, so we don't want to 2724 // stop and rewrite on anything that looks like a subregister 2725 // extract. Operations mostly don't care about the super register class, so we 2726 // only want to stop on the most basic of copies between the same register 2727 // class. 2728 // 2729 // e.g. if we have something like 2730 // %0 = ... 2731 // %1 = ... 2732 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 2733 // %3 = COPY %2, sub0 2734 // 2735 // We want to look through the COPY to find: 2736 // => %3 = COPY %0 2737 2738 // Plain copy. 
2739 return getCommonSubClass(DefRC, SrcRC) != nullptr; 2740 } 2741 2742 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { 2743 // TODO: 64-bit operands have extending behavior from 32-bit literal. 2744 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && 2745 OpType <= AMDGPU::OPERAND_REG_IMM_LAST; 2746 } 2747 2748 /// Returns a lowest register that is not used at any point in the function. 2749 /// If all registers are used, then this function will return 2750 /// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return 2751 /// highest unused register. 2752 MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, 2753 const TargetRegisterClass *RC, 2754 const MachineFunction &MF, 2755 bool ReserveHighestVGPR) const { 2756 if (ReserveHighestVGPR) { 2757 for (MCRegister Reg : reverse(*RC)) 2758 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2759 return Reg; 2760 } else { 2761 for (MCRegister Reg : *RC) 2762 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2763 return Reg; 2764 } 2765 return MCRegister(); 2766 } 2767 2768 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, 2769 unsigned EltSize) const { 2770 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC); 2771 assert(RegBitWidth >= 32 && RegBitWidth <= 1024); 2772 2773 const unsigned RegDWORDs = RegBitWidth / 32; 2774 const unsigned EltDWORDs = EltSize / 4; 2775 assert(RegSplitParts.size() + 1 >= EltDWORDs); 2776 2777 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1]; 2778 const unsigned NumParts = RegDWORDs / EltDWORDs; 2779 2780 return makeArrayRef(Parts.data(), NumParts); 2781 } 2782 2783 const TargetRegisterClass* 2784 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, 2785 Register Reg) const { 2786 return Reg.isVirtual() ? 
MRI.getRegClass(Reg) : getPhysRegClass(Reg); 2787 } 2788 2789 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, 2790 Register Reg) const { 2791 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 2792 // Registers without classes are unaddressable, SGPR-like registers. 2793 return RC && isVGPRClass(RC); 2794 } 2795 2796 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, 2797 Register Reg) const { 2798 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 2799 2800 // Registers without classes are unaddressable, SGPR-like registers. 2801 return RC && isAGPRClass(RC); 2802 } 2803 2804 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, 2805 const TargetRegisterClass *SrcRC, 2806 unsigned SubReg, 2807 const TargetRegisterClass *DstRC, 2808 unsigned DstSubReg, 2809 const TargetRegisterClass *NewRC, 2810 LiveIntervals &LIS) const { 2811 unsigned SrcSize = getRegSizeInBits(*SrcRC); 2812 unsigned DstSize = getRegSizeInBits(*DstRC); 2813 unsigned NewSize = getRegSizeInBits(*NewRC); 2814 2815 // Do not increase size of registers beyond dword, we would need to allocate 2816 // adjacent registers and constraint regalloc more than needed. 2817 2818 // Always allow dword coalescing. 
2819 if (SrcSize <= 32 || DstSize <= 32) 2820 return true; 2821 2822 return NewSize <= DstSize || NewSize <= SrcSize; 2823 } 2824 2825 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 2826 MachineFunction &MF) const { 2827 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2828 2829 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), 2830 MF.getFunction()); 2831 switch (RC->getID()) { 2832 default: 2833 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); 2834 case AMDGPU::VGPR_32RegClassID: 2835 case AMDGPU::VGPR_LO16RegClassID: 2836 case AMDGPU::VGPR_HI16RegClassID: 2837 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); 2838 case AMDGPU::SGPR_32RegClassID: 2839 case AMDGPU::SGPR_LO16RegClassID: 2840 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); 2841 } 2842 } 2843 2844 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 2845 unsigned Idx) const { 2846 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || 2847 Idx == AMDGPU::RegisterPressureSets::AGPR_32) 2848 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 2849 const_cast<MachineFunction &>(MF)); 2850 2851 if (Idx == AMDGPU::RegisterPressureSets::SReg_32) 2852 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 2853 const_cast<MachineFunction &>(MF)); 2854 2855 llvm_unreachable("Unexpected register pressure set!"); 2856 } 2857 2858 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 2859 static const int Empty[] = { -1 }; 2860 2861 if (RegPressureIgnoredUnits[RegUnit]) 2862 return Empty; 2863 2864 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); 2865 } 2866 2867 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 2868 // Not a callee saved register. 
2869 return AMDGPU::SGPR30_SGPR31; 2870 } 2871 2872 const TargetRegisterClass * 2873 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 2874 const RegisterBank &RB, 2875 const MachineRegisterInfo &MRI) const { 2876 switch (RB.getID()) { 2877 case AMDGPU::VGPRRegBankID: 2878 return getVGPRClassForBitWidth(std::max(32u, Size)); 2879 case AMDGPU::VCCRegBankID: 2880 assert(Size == 1); 2881 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 2882 : &AMDGPU::SReg_64_XEXECRegClass; 2883 case AMDGPU::SGPRRegBankID: 2884 return getSGPRClassForBitWidth(std::max(32u, Size)); 2885 case AMDGPU::AGPRRegBankID: 2886 return getAGPRClassForBitWidth(std::max(32u, Size)); 2887 default: 2888 llvm_unreachable("unknown register bank"); 2889 } 2890 } 2891 2892 const TargetRegisterClass * 2893 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 2894 const MachineRegisterInfo &MRI) const { 2895 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 2896 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) 2897 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); 2898 2899 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>()) 2900 return getAllocatableClass(RC); 2901 2902 return nullptr; 2903 } 2904 2905 MCRegister SIRegisterInfo::getVCC() const { 2906 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 2907 } 2908 2909 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { 2910 // VGPR tuples have an alignment requirement on gfx90a variants. 2911 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass 2912 : &AMDGPU::VReg_64RegClass; 2913 } 2914 2915 const TargetRegisterClass * 2916 SIRegisterInfo::getRegClass(unsigned RCID) const { 2917 switch ((int)RCID) { 2918 case AMDGPU::SReg_1RegClassID: 2919 return getBoolRC(); 2920 case AMDGPU::SReg_1_XEXECRegClassID: 2921 return isWave32 ? 
&AMDGPU::SReg_32_XM0_XEXECRegClass 2922 : &AMDGPU::SReg_64_XEXECRegClass; 2923 case -1: 2924 return nullptr; 2925 default: 2926 return AMDGPUGenRegisterInfo::getRegClass(RCID); 2927 } 2928 } 2929 2930 // Find reaching register definition 2931 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 2932 MachineInstr &Use, 2933 MachineRegisterInfo &MRI, 2934 LiveIntervals *LIS) const { 2935 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 2936 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 2937 SlotIndex DefIdx; 2938 2939 if (Reg.isVirtual()) { 2940 if (!LIS->hasInterval(Reg)) 2941 return nullptr; 2942 LiveInterval &LI = LIS->getInterval(Reg); 2943 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 2944 : MRI.getMaxLaneMaskForVReg(Reg); 2945 VNInfo *V = nullptr; 2946 if (LI.hasSubRanges()) { 2947 for (auto &S : LI.subranges()) { 2948 if ((S.LaneMask & SubLanes) == SubLanes) { 2949 V = S.getVNInfoAt(UseIdx); 2950 break; 2951 } 2952 } 2953 } else { 2954 V = LI.getVNInfoAt(UseIdx); 2955 } 2956 if (!V) 2957 return nullptr; 2958 DefIdx = V->def; 2959 } else { 2960 // Find last def. 
2961 for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid(); 2962 ++Units) { 2963 LiveRange &LR = LIS->getRegUnit(*Units); 2964 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 2965 if (!DefIdx.isValid() || 2966 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 2967 LIS->getInstructionFromIndex(V->def))) 2968 DefIdx = V->def; 2969 } else { 2970 return nullptr; 2971 } 2972 } 2973 } 2974 2975 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 2976 2977 if (!Def || !MDT.dominates(Def, &Use)) 2978 return nullptr; 2979 2980 assert(Def->modifiesRegister(Reg, this)); 2981 2982 return Def; 2983 } 2984 2985 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 2986 assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32); 2987 2988 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 2989 AMDGPU::SReg_32RegClass, 2990 AMDGPU::AGPR_32RegClass } ) { 2991 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 2992 return Super; 2993 } 2994 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 2995 &AMDGPU::VGPR_32RegClass)) { 2996 return Super; 2997 } 2998 2999 return AMDGPU::NoRegister; 3000 } 3001 3002 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { 3003 if (!ST.needsAlignedVGPRs()) 3004 return true; 3005 3006 if (isVGPRClass(&RC)) 3007 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); 3008 if (isAGPRClass(&RC)) 3009 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); 3010 if (isVectorSuperClass(&RC)) 3011 return RC.hasSuperClassEq( 3012 getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); 3013 3014 return true; 3015 } 3016 3017 bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { 3018 switch (PhysReg) { 3019 case AMDGPU::SGPR_NULL: 3020 case AMDGPU::SRC_SHARED_BASE: 3021 case AMDGPU::SRC_PRIVATE_BASE: 3022 case AMDGPU::SRC_SHARED_LIMIT: 3023 case AMDGPU::SRC_PRIVATE_LIMIT: 3024 return true; 3025 default: 3026 return false; 3027 } 
3028 } 3029 3030 ArrayRef<MCPhysReg> 3031 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { 3032 return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), 3033 ST.getMaxNumSGPRs(MF) / 4); 3034 } 3035 3036 ArrayRef<MCPhysReg> 3037 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 3038 return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(), 3039 ST.getMaxNumSGPRs(MF) / 2); 3040 } 3041 3042 ArrayRef<MCPhysReg> 3043 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 3044 return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 3045 } 3046