1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI implementation of the TargetRegisterInfo class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIRegisterInfo.h" 15 #include "AMDGPU.h" 16 #include "AMDGPURegisterBankInfo.h" 17 #include "GCNSubtarget.h" 18 #include "MCTargetDesc/AMDGPUInstPrinter.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "llvm/CodeGen/LiveIntervals.h" 22 #include "llvm/CodeGen/LivePhysRegs.h" 23 #include "llvm/CodeGen/MachineDominators.h" 24 #include "llvm/CodeGen/MachineFrameInfo.h" 25 #include "llvm/CodeGen/RegisterScavenging.h" 26 27 using namespace llvm; 28 29 #define GET_REGINFO_TARGET_DESC 30 #include "AMDGPUGenRegisterInfo.inc" 31 32 static cl::opt<bool> EnableSpillSGPRToVGPR( 33 "amdgpu-spill-sgpr-to-vgpr", 34 cl::desc("Enable spilling VGPRs to SGPRs"), 35 cl::ReallyHidden, 36 cl::init(true)); 37 38 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts; 39 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable; 40 41 // Map numbers of DWORDs to indexes in SubRegFromChannelTable. 42 // Valid indexes are shifted 1, such that a 0 mapping means unsupported. 43 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8, 44 // meaning index 7 in SubRegFromChannelTable. 45 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = { 46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9}; 47 48 namespace llvm { 49 50 // A temporary struct to spill SGPRs. 51 // This is mostly to spill SGPRs to memory. 
Spilling SGPRs into VGPR lanes emits 52 // just v_writelane and v_readlane. 53 // 54 // When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR 55 // is saved to scratch (or the other way around for loads). 56 // For this, a VGPR is required where the needed lanes can be clobbered. The 57 // RegScavenger can provide a VGPR where currently active lanes can be 58 // clobbered, but we still need to save inactive lanes. 59 // The high-level steps are: 60 // - Try to scavenge SGPR(s) to save exec 61 // - Try to scavenge VGPR 62 // - Save needed, all or inactive lanes of a TmpVGPR 63 // - Spill/Restore SGPRs using TmpVGPR 64 // - Restore TmpVGPR 65 // 66 // To save all lanes of TmpVGPR, exec needs to be saved and modified. If we 67 // cannot scavenge temporary SGPRs to save exec, we use the following code: 68 // buffer_store_dword TmpVGPR ; only if active lanes need to be saved 69 // s_not exec, exec 70 // buffer_store_dword TmpVGPR ; save inactive lanes 71 // s_not exec, exec 72 struct SGPRSpillBuilder { 73 struct PerVGPRData { 74 unsigned PerVGPR; 75 unsigned NumVGPRs; 76 int64_t VGPRLanes; 77 }; 78 79 // The SGPR to save 80 Register SuperReg; 81 MachineBasicBlock::iterator MI; 82 ArrayRef<int16_t> SplitParts; 83 unsigned NumSubRegs; 84 bool IsKill; 85 const DebugLoc &DL; 86 87 /* When spilling to stack */ 88 // The SGPRs are written into this VGPR, which is then written to scratch 89 // (or vice versa for loads). 90 Register TmpVGPR = AMDGPU::NoRegister; 91 // Temporary spill slot to save TmpVGPR to. 92 int TmpVGPRIndex = 0; 93 // If TmpVGPR is live before the spill or if it is scavenged. 94 bool TmpVGPRLive = false; 95 // Scavenged SGPR to save EXEC. 96 Register SavedExecReg = AMDGPU::NoRegister; 97 // Stack index to write the SGPRs to. 
98 int Index; 99 unsigned EltSize = 4; 100 101 RegScavenger *RS; 102 MachineBasicBlock *MBB; 103 MachineFunction &MF; 104 SIMachineFunctionInfo &MFI; 105 const SIInstrInfo &TII; 106 const SIRegisterInfo &TRI; 107 bool IsWave32; 108 Register ExecReg; 109 unsigned MovOpc; 110 unsigned NotOpc; 111 112 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, 113 bool IsWave32, MachineBasicBlock::iterator MI, int Index, 114 RegScavenger *RS) 115 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(), 116 MI->getOperand(0).isKill(), Index, RS) {} 117 118 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, 119 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, 120 bool IsKill, int Index, RegScavenger *RS) 121 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()), 122 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()), 123 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), 124 IsWave32(IsWave32) { 125 const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg); 126 SplitParts = TRI.getRegSplitParts(RC, EltSize); 127 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 128 129 if (IsWave32) { 130 ExecReg = AMDGPU::EXEC_LO; 131 MovOpc = AMDGPU::S_MOV_B32; 132 NotOpc = AMDGPU::S_NOT_B32; 133 } else { 134 ExecReg = AMDGPU::EXEC; 135 MovOpc = AMDGPU::S_MOV_B64; 136 NotOpc = AMDGPU::S_NOT_B64; 137 } 138 139 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 140 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && 141 SuperReg != AMDGPU::EXEC && "exec should never spill"); 142 } 143 144 PerVGPRData getPerVGPRData() { 145 PerVGPRData Data; 146 Data.PerVGPR = IsWave32 ? 32 : 64; 147 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR; 148 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL; 149 return Data; 150 } 151 152 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is 153 // free. 
154 // Writes these instructions if an SGPR can be scavenged: 155 // s_mov_b64 s[6:7], exec ; Save exec 156 // s_mov_b64 exec, 3 ; Wanted lanemask 157 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot 158 // 159 // Writes these instructions if no SGPR can be scavenged: 160 // buffer_store_dword v0 ; Only if no free VGPR was found 161 // s_not_b64 exec, exec 162 // buffer_store_dword v0 ; Save inactive lanes 163 // ; exec stays inverted, it is flipped back in 164 // ; restore. 165 void prepare() { 166 // Scavenged temporary VGPR to use. It must be scavenged once for any number 167 // of spilled subregs. 168 // FIXME: The liveness analysis is limited and does not tell if a register 169 // is in use in lanes that are currently inactive. We can never be sure if 170 // a register as actually in use in another lane, so we need to save all 171 // used lanes of the chosen VGPR. 172 assert(RS && "Cannot spill SGPR to memory without RegScavenger"); 173 TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false); 174 175 // Reserve temporary stack slot 176 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI); 177 if (TmpVGPR) { 178 // Found a register that is dead in the currently active lanes, we only 179 // need to spill inactive lanes. 180 TmpVGPRLive = false; 181 } else { 182 // Pick v0 because it doesn't make a difference. 183 TmpVGPR = AMDGPU::VGPR0; 184 TmpVGPRLive = true; 185 } 186 187 if (TmpVGPRLive) { 188 // We need to inform the scavenger that this index is already in use until 189 // we're done with the custom emergency spill. 190 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR); 191 } 192 193 // We may end up recursively calling the scavenger, and don't want to re-use 194 // the same register. 195 RS->setRegUsed(TmpVGPR); 196 197 // Try to scavenge SGPRs to save exec 198 assert(!SavedExecReg && "Exec is already saved, refuse to save again"); 199 const TargetRegisterClass &RC = 200 IsWave32 ? 
AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass; 201 RS->setRegUsed(SuperReg); 202 SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false); 203 204 int64_t VGPRLanes = getPerVGPRData().VGPRLanes; 205 206 if (SavedExecReg) { 207 RS->setRegUsed(SavedExecReg); 208 // Set exec to needed lanes 209 BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg); 210 auto I = 211 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes); 212 if (!TmpVGPRLive) 213 I.addReg(TmpVGPR, RegState::ImplicitDefine); 214 // Spill needed lanes 215 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); 216 } else { 217 // The modify and restore of exec clobber SCC, which we would have to save 218 // and restore. FIXME: We probably would need to reserve a register for 219 // this. 220 if (RS->isRegUsed(AMDGPU::SCC)) 221 MI->emitError("unhandled SGPR spill to memory"); 222 223 // Spill active lanes 224 if (TmpVGPRLive) 225 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false, 226 /*IsKill*/ false); 227 // Spill inactive lanes 228 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 229 if (!TmpVGPRLive) 230 I.addReg(TmpVGPR, RegState::ImplicitDefine); 231 I->getOperand(2).setIsDead(true); // Mark SCC as dead. 
232 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); 233 } 234 } 235 236 // Writes these instructions if an SGPR can be scavenged: 237 // buffer_load_dword v1 ; Write scavenged VGPR to emergency slot 238 // s_waitcnt vmcnt(0) ; If a free VGPR was found 239 // s_mov_b64 exec, s[6:7] ; Save exec 240 // 241 // Writes these instructions if no SGPR can be scavenged: 242 // buffer_load_dword v0 ; Restore inactive lanes 243 // s_waitcnt vmcnt(0) ; If a free VGPR was found 244 // s_not_b64 exec, exec 245 // buffer_load_dword v0 ; Only if no free VGPR was found 246 void restore() { 247 if (SavedExecReg) { 248 // Restore used lanes 249 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, 250 /*IsKill*/ false); 251 // Restore exec 252 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg) 253 .addReg(SavedExecReg, RegState::Kill); 254 // Add an implicit use of the load so it is not dead. 255 // FIXME This inserts an unnecessary waitcnt 256 if (!TmpVGPRLive) { 257 I.addReg(TmpVGPR, RegState::ImplicitKill); 258 } 259 } else { 260 // Restore inactive lanes 261 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, 262 /*IsKill*/ false); 263 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 264 if (!TmpVGPRLive) 265 I.addReg(TmpVGPR, RegState::ImplicitKill); 266 I->getOperand(2).setIsDead(true); // Mark SCC as dead. 267 268 // Restore active lanes 269 if (TmpVGPRLive) 270 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); 271 } 272 273 // Inform the scavenger where we're releasing our custom scavenged register. 274 if (TmpVGPRLive) { 275 MachineBasicBlock::iterator RestorePt = std::prev(MI); 276 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt); 277 } 278 } 279 280 // Write TmpVGPR to memory or read TmpVGPR from memory. 
281 // Either using a single buffer_load/store if exec is set to the needed mask 282 // or using 283 // buffer_load 284 // s_not exec, exec 285 // buffer_load 286 // s_not exec, exec 287 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) { 288 if (SavedExecReg) { 289 // Spill needed lanes 290 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 291 } else { 292 // The modify and restore of exec clobber SCC, which we would have to save 293 // and restore. FIXME: We probably would need to reserve a register for 294 // this. 295 if (RS->isRegUsed(AMDGPU::SCC)) 296 MI->emitError("unhandled SGPR spill to memory"); 297 298 // Spill active lanes 299 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, 300 /*IsKill*/ false); 301 // Spill inactive lanes 302 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 303 Not0->getOperand(2).setIsDead(); // Mark SCC as dead. 304 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 305 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 306 Not1->getOperand(2).setIsDead(); // Mark SCC as dead. 
307 } 308 } 309 310 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) { 311 assert(MBB->getParent() == &MF); 312 MI = NewMI; 313 MBB = NewMBB; 314 } 315 }; 316 317 } // namespace llvm 318 319 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) 320 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST), 321 SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { 322 323 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && 324 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) && 325 (getSubRegIndexLaneMask(AMDGPU::lo16) | 326 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() == 327 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() && 328 "getNumCoveredRegs() will not work with generated subreg masks!"); 329 330 RegPressureIgnoredUnits.resize(getNumRegUnits()); 331 RegPressureIgnoredUnits.set( 332 *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this)); 333 for (auto Reg : AMDGPU::VGPR_HI16RegClass) 334 RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this)); 335 336 // HACK: Until this is fully tablegen'd. 337 static llvm::once_flag InitializeRegSplitPartsFlag; 338 339 static auto InitializeRegSplitPartsOnce = [this]() { 340 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) { 341 unsigned Size = getSubRegIdxSize(Idx); 342 if (Size & 31) 343 continue; 344 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1]; 345 unsigned Pos = getSubRegIdxOffset(Idx); 346 if (Pos % Size) 347 continue; 348 Pos /= Size; 349 if (Vec.empty()) { 350 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits. 
351 Vec.resize(MaxNumParts); 352 } 353 Vec[Pos] = Idx; 354 } 355 }; 356 357 static llvm::once_flag InitializeSubRegFromChannelTableFlag; 358 359 static auto InitializeSubRegFromChannelTableOnce = [this]() { 360 for (auto &Row : SubRegFromChannelTable) 361 Row.fill(AMDGPU::NoSubRegister); 362 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { 363 unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32; 364 unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32; 365 assert(Width < SubRegFromChannelTableWidthMap.size()); 366 Width = SubRegFromChannelTableWidthMap[Width]; 367 if (Width == 0) 368 continue; 369 unsigned TableIdx = Width - 1; 370 assert(TableIdx < SubRegFromChannelTable.size()); 371 assert(Offset < SubRegFromChannelTable[TableIdx].size()); 372 SubRegFromChannelTable[TableIdx][Offset] = Idx; 373 } 374 }; 375 376 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce); 377 llvm::call_once(InitializeSubRegFromChannelTableFlag, 378 InitializeSubRegFromChannelTableOnce); 379 } 380 381 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, 382 MCRegister Reg) const { 383 MCRegAliasIterator R(Reg, this, true); 384 385 for (; R.isValid(); ++R) 386 Reserved.set(*R); 387 } 388 389 // Forced to be here by one .inc 390 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( 391 const MachineFunction *MF) const { 392 CallingConv::ID CC = MF->getFunction().getCallingConv(); 393 switch (CC) { 394 case CallingConv::C: 395 case CallingConv::Fast: 396 case CallingConv::Cold: 397 return ST.hasGFX90AInsts() ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList 398 : CSR_AMDGPU_HighRegs_SaveList; 399 case CallingConv::AMDGPU_Gfx: 400 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList 401 : CSR_AMDGPU_SI_Gfx_SaveList; 402 default: { 403 // Dummy to not crash RegisterClassInfo. 
404 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; 405 return &NoCalleeSavedReg; 406 } 407 } 408 } 409 410 const MCPhysReg * 411 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { 412 return nullptr; 413 } 414 415 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, 416 CallingConv::ID CC) const { 417 switch (CC) { 418 case CallingConv::C: 419 case CallingConv::Fast: 420 case CallingConv::Cold: 421 return ST.hasGFX90AInsts() ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask 422 : CSR_AMDGPU_HighRegs_RegMask; 423 case CallingConv::AMDGPU_Gfx: 424 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask 425 : CSR_AMDGPU_SI_Gfx_RegMask; 426 default: 427 return nullptr; 428 } 429 } 430 431 const uint32_t *SIRegisterInfo::getNoPreservedMask() const { 432 return CSR_AMDGPU_NoRegs_RegMask; 433 } 434 435 const TargetRegisterClass * 436 SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, 437 const MachineFunction &MF) const { 438 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the 439 // equivalent AV class. If used one, the verifier will crash after 440 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given 441 // until Instruction selection. 
442 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) { 443 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) 444 return &AMDGPU::AV_32RegClass; 445 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) 446 return &AMDGPU::AV_64RegClass; 447 if (RC == &AMDGPU::VReg_64_Align2RegClass || 448 RC == &AMDGPU::AReg_64_Align2RegClass) 449 return &AMDGPU::AV_64_Align2RegClass; 450 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass) 451 return &AMDGPU::AV_96RegClass; 452 if (RC == &AMDGPU::VReg_96_Align2RegClass || 453 RC == &AMDGPU::AReg_96_Align2RegClass) 454 return &AMDGPU::AV_96_Align2RegClass; 455 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass) 456 return &AMDGPU::AV_128RegClass; 457 if (RC == &AMDGPU::VReg_128_Align2RegClass || 458 RC == &AMDGPU::AReg_128_Align2RegClass) 459 return &AMDGPU::AV_128_Align2RegClass; 460 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass) 461 return &AMDGPU::AV_160RegClass; 462 if (RC == &AMDGPU::VReg_160_Align2RegClass || 463 RC == &AMDGPU::AReg_160_Align2RegClass) 464 return &AMDGPU::AV_160_Align2RegClass; 465 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass) 466 return &AMDGPU::AV_192RegClass; 467 if (RC == &AMDGPU::VReg_192_Align2RegClass || 468 RC == &AMDGPU::AReg_192_Align2RegClass) 469 return &AMDGPU::AV_192_Align2RegClass; 470 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass) 471 return &AMDGPU::AV_256RegClass; 472 if (RC == &AMDGPU::VReg_256_Align2RegClass || 473 RC == &AMDGPU::AReg_256_Align2RegClass) 474 return &AMDGPU::AV_256_Align2RegClass; 475 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass) 476 return &AMDGPU::AV_512RegClass; 477 if (RC == &AMDGPU::VReg_512_Align2RegClass || 478 RC == &AMDGPU::AReg_512_Align2RegClass) 479 return &AMDGPU::AV_512_Align2RegClass; 480 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass) 481 return 
&AMDGPU::AV_1024RegClass; 482 if (RC == &AMDGPU::VReg_1024_Align2RegClass || 483 RC == &AMDGPU::AReg_1024_Align2RegClass) 484 return &AMDGPU::AV_1024_Align2RegClass; 485 } 486 487 return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF); 488 } 489 490 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { 491 const SIFrameLowering *TFI = ST.getFrameLowering(); 492 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 493 // During ISel lowering we always reserve the stack pointer in entry 494 // functions, but never actually want to reference it when accessing our own 495 // frame. If we need a frame pointer we use it, but otherwise we can just use 496 // an immediate "0" which we represent by returning NoRegister. 497 if (FuncInfo->isEntryFunction()) { 498 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); 499 } 500 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() 501 : FuncInfo->getStackPtrOffsetReg(); 502 } 503 504 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { 505 // When we need stack realignment, we can't reference off of the 506 // stack pointer, so we reserve a base pointer. 
507 const MachineFrameInfo &MFI = MF.getFrameInfo(); 508 return MFI.getNumFixedObjects() && shouldRealignStack(MF); 509 } 510 511 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } 512 513 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { 514 return CSR_AMDGPU_AllVGPRs_RegMask; 515 } 516 517 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const { 518 return CSR_AMDGPU_AllAGPRs_RegMask; 519 } 520 521 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const { 522 return CSR_AMDGPU_AllVectorRegs_RegMask; 523 } 524 525 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { 526 return CSR_AMDGPU_AllAllocatableSRegs_RegMask; 527 } 528 529 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, 530 unsigned NumRegs) { 531 assert(NumRegs < SubRegFromChannelTableWidthMap.size()); 532 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs]; 533 assert(NumRegIndex && "Not implemented"); 534 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size()); 535 return SubRegFromChannelTable[NumRegIndex - 1][Channel]; 536 } 537 538 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( 539 const MachineFunction &MF) const { 540 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; 541 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); 542 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); 543 } 544 545 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { 546 BitVector Reserved(getNumRegs()); 547 Reserved.set(AMDGPU::MODE); 548 549 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but 550 // this seems likely to result in bugs, so I'm marking them as reserved. 551 reserveRegisterTuples(Reserved, AMDGPU::EXEC); 552 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); 553 554 // M0 has to be reserved so that llvm accepts it as a live-in into a block. 
555 reserveRegisterTuples(Reserved, AMDGPU::M0); 556 557 // Reserve src_vccz, src_execz, src_scc. 558 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); 559 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); 560 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); 561 562 // Reserve the memory aperture registers. 563 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); 564 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); 565 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); 566 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); 567 568 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. 569 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); 570 571 // Reserve xnack_mask registers - support is not implemented in Codegen. 572 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); 573 574 // Reserve lds_direct register - support is not implemented in Codegen. 575 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); 576 577 // Reserve Trap Handler registers - support is not implemented in Codegen. 578 reserveRegisterTuples(Reserved, AMDGPU::TBA); 579 reserveRegisterTuples(Reserved, AMDGPU::TMA); 580 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); 581 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); 582 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); 583 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); 584 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); 585 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); 586 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); 587 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); 588 589 // Reserve null register - it shall never be allocated 590 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); 591 592 // Disallow vcc_hi allocation in wave32. It may be allocated but most likely 593 // will result in bugs. 
594 if (isWave32) { 595 Reserved.set(AMDGPU::VCC); 596 Reserved.set(AMDGPU::VCC_HI); 597 } 598 599 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 600 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 601 for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { 602 unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); 603 reserveRegisterTuples(Reserved, Reg); 604 } 605 606 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 607 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); 608 unsigned MaxNumAGPRs = MaxNumVGPRs; 609 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 610 611 if (ST.hasGFX90AInsts()) { 612 // In an entry function without calls and AGPRs used it is possible to use 613 // the whole register budget for VGPRs. 614 615 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and 616 // split register file accordingly. 617 if (MFI->usesAGPRs(MF)) { 618 MaxNumVGPRs /= 2; 619 MaxNumAGPRs = MaxNumVGPRs; 620 } else { 621 if (MaxNumVGPRs > TotalNumVGPRs) { 622 MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; 623 MaxNumVGPRs = TotalNumVGPRs; 624 } else 625 MaxNumAGPRs = 0; 626 } 627 } else if (ST.hasMAIInsts()) { 628 // In order to guarantee copying between AGPRs, we need a scratch VGPR 629 // available at all times. 630 reserveRegisterTuples(Reserved, AMDGPU::VGPR32); 631 } 632 633 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { 634 unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); 635 reserveRegisterTuples(Reserved, Reg); 636 } 637 638 for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) { 639 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 640 reserveRegisterTuples(Reserved, Reg); 641 } 642 643 for (auto Reg : AMDGPU::SReg_32RegClass) { 644 Reserved.set(getSubReg(Reg, AMDGPU::hi16)); 645 Register Low = getSubReg(Reg, AMDGPU::lo16); 646 // This is to prevent BB vcc liveness errors. 
647 if (!AMDGPU::SGPR_LO16RegClass.contains(Low)) 648 Reserved.set(Low); 649 } 650 651 for (auto Reg : AMDGPU::AGPR_32RegClass) { 652 Reserved.set(getSubReg(Reg, AMDGPU::hi16)); 653 } 654 655 // Reserve all the rest AGPRs if there are no instructions to use it. 656 if (!ST.hasMAIInsts()) { 657 for (unsigned i = 0; i < MaxNumVGPRs; ++i) { 658 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 659 reserveRegisterTuples(Reserved, Reg); 660 } 661 } 662 663 Register ScratchRSrcReg = MFI->getScratchRSrcReg(); 664 if (ScratchRSrcReg != AMDGPU::NoRegister) { 665 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need 666 // to spill. 667 // TODO: May need to reserve a VGPR if doing LDS spilling. 668 reserveRegisterTuples(Reserved, ScratchRSrcReg); 669 } 670 671 // We have to assume the SP is needed in case there are calls in the function, 672 // which is detected after the function is lowered. If we aren't really going 673 // to need SP, don't bother reserving it. 674 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); 675 676 if (StackPtrReg) { 677 reserveRegisterTuples(Reserved, StackPtrReg); 678 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); 679 } 680 681 MCRegister FrameReg = MFI->getFrameOffsetReg(); 682 if (FrameReg) { 683 reserveRegisterTuples(Reserved, FrameReg); 684 assert(!isSubRegister(ScratchRSrcReg, FrameReg)); 685 } 686 687 if (hasBasePointer(MF)) { 688 MCRegister BasePtrReg = getBaseRegister(); 689 reserveRegisterTuples(Reserved, BasePtrReg); 690 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); 691 } 692 693 for (auto Reg : MFI->WWMReservedRegs) { 694 reserveRegisterTuples(Reserved, Reg.first); 695 } 696 697 // Reserve VGPRs used for SGPR spilling. 698 // Note we treat freezeReservedRegs unusually because we run register 699 // allocation in two phases. It's OK to re-freeze with new registers for the 700 // second run. 
701 #if 0 702 for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) { 703 for (auto &SpilledVGPR : SpilledFI.second) 704 reserveRegisterTuples(Reserved, SpilledVGPR.VGPR); 705 } 706 #endif 707 708 // FIXME: Stop using reserved registers for this. 709 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) 710 reserveRegisterTuples(Reserved, Reg); 711 712 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) 713 reserveRegisterTuples(Reserved, Reg); 714 715 for (auto SSpill : MFI->getSGPRSpillVGPRs()) 716 reserveRegisterTuples(Reserved, SSpill.VGPR); 717 718 return Reserved; 719 } 720 721 bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF, 722 MCRegister PhysReg) const { 723 return !MF.getRegInfo().isReserved(PhysReg); 724 } 725 726 bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const { 727 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 728 // On entry, the base address is 0, so it can't possibly need any more 729 // alignment. 730 731 // FIXME: Should be able to specify the entry frame alignment per calling 732 // convention instead. 733 if (Info->isEntryFunction()) 734 return false; 735 736 return TargetRegisterInfo::shouldRealignStack(MF); 737 } 738 739 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { 740 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); 741 if (Info->isEntryFunction()) { 742 const MachineFrameInfo &MFI = Fn.getFrameInfo(); 743 return MFI.hasStackObjects() || MFI.hasCalls(); 744 } 745 746 // May need scavenger for dealing with callee saved registers. 747 return true; 748 } 749 750 bool SIRegisterInfo::requiresFrameIndexScavenging( 751 const MachineFunction &MF) const { 752 // Do not use frame virtual registers. They used to be used for SGPRs, but 753 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the 754 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a 755 // spill. 
756 return false; 757 } 758 759 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( 760 const MachineFunction &MF) const { 761 const MachineFrameInfo &MFI = MF.getFrameInfo(); 762 return MFI.hasStackObjects(); 763 } 764 765 bool SIRegisterInfo::requiresVirtualBaseRegisters( 766 const MachineFunction &) const { 767 // There are no special dedicated stack or frame pointers. 768 return true; 769 } 770 771 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const { 772 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI)); 773 774 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 775 AMDGPU::OpName::offset); 776 return MI->getOperand(OffIdx).getImm(); 777 } 778 779 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, 780 int Idx) const { 781 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 782 return 0; 783 784 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 785 AMDGPU::OpName::vaddr) || 786 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 787 AMDGPU::OpName::saddr))) && 788 "Should never see frame index on non-address operand"); 789 790 return getScratchInstrOffset(MI); 791 } 792 793 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { 794 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 795 return false; 796 797 int64_t FullOffset = Offset + getScratchInstrOffset(MI); 798 799 if (SIInstrInfo::isMUBUF(*MI)) 800 return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset); 801 802 const SIInstrInfo *TII = ST.getInstrInfo(); 803 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, 804 SIInstrFlags::FlatScratch); 805 } 806 807 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, 808 int FrameIdx, 809 int64_t Offset) const { 810 MachineBasicBlock::iterator Ins = MBB->begin(); 811 DebugLoc DL; // Defaults to "unknown" 812 813 if (Ins != MBB->end()) 814 DL = Ins->getDebugLoc(); 815 816 
MachineFunction *MF = MBB->getParent(); 817 const SIInstrInfo *TII = ST.getInstrInfo(); 818 MachineRegisterInfo &MRI = MF->getRegInfo(); 819 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32 820 : AMDGPU::V_MOV_B32_e32; 821 822 Register BaseReg = MRI.createVirtualRegister( 823 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass 824 : &AMDGPU::VGPR_32RegClass); 825 826 if (Offset == 0) { 827 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg) 828 .addFrameIndex(FrameIdx); 829 return BaseReg; 830 } 831 832 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 833 834 Register FIReg = MRI.createVirtualRegister( 835 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass 836 : &AMDGPU::VGPR_32RegClass); 837 838 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 839 .addImm(Offset); 840 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg) 841 .addFrameIndex(FrameIdx); 842 843 if (ST.enableFlatScratch() ) { 844 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) 845 .addReg(OffsetReg, RegState::Kill) 846 .addReg(FIReg); 847 return BaseReg; 848 } 849 850 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) 851 .addReg(OffsetReg, RegState::Kill) 852 .addReg(FIReg) 853 .addImm(0); // clamp bit 854 855 return BaseReg; 856 } 857 858 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, 859 int64_t Offset) const { 860 const SIInstrInfo *TII = ST.getInstrInfo(); 861 bool IsFlat = TII->isFLATScratch(MI); 862 863 #ifndef NDEBUG 864 // FIXME: Is it possible to be storing a frame index to itself? 865 bool SeenFI = false; 866 for (const MachineOperand &MO: MI.operands()) { 867 if (MO.isFI()) { 868 if (SeenFI) 869 llvm_unreachable("should not see multiple frame indices"); 870 871 SeenFI = true; 872 } 873 } 874 #endif 875 876 MachineOperand *FIOp = 877 TII->getNamedOperand(MI, IsFlat ? 
AMDGPU::OpName::saddr 878 : AMDGPU::OpName::vaddr); 879 880 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); 881 int64_t NewOffset = OffsetOp->getImm() + Offset; 882 883 assert(FIOp && FIOp->isFI() && "frame index must be address operand"); 884 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI)); 885 886 if (IsFlat) { 887 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 888 SIInstrFlags::FlatScratch) && 889 "offset should be legal"); 890 FIOp->ChangeToRegister(BaseReg, false); 891 OffsetOp->setImm(NewOffset); 892 return; 893 } 894 895 #ifndef NDEBUG 896 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset); 897 assert(SOffset->isImm() && SOffset->getImm() == 0); 898 #endif 899 900 assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && 901 "offset should be legal"); 902 903 FIOp->ChangeToRegister(BaseReg, false); 904 OffsetOp->setImm(NewOffset); 905 } 906 907 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, 908 Register BaseReg, 909 int64_t Offset) const { 910 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 911 return false; 912 913 int64_t NewOffset = Offset + getScratchInstrOffset(MI); 914 915 if (SIInstrInfo::isMUBUF(*MI)) 916 return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset); 917 918 const SIInstrInfo *TII = ST.getInstrInfo(); 919 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 920 SIInstrFlags::FlatScratch); 921 } 922 923 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( 924 const MachineFunction &MF, unsigned Kind) const { 925 // This is inaccurate. It depends on the instruction and address space. The 926 // only place where we should hit this is for dealing with frame indexes / 927 // private accesses, so this is correct in that case. 
928 return &AMDGPU::VGPR_32RegClass; 929 } 930 931 const TargetRegisterClass * 932 SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { 933 if (isAGPRClass(RC) && !ST.hasGFX90AInsts()) 934 return getEquivalentVGPRClass(RC); 935 936 return RC; 937 } 938 939 static unsigned getNumSubRegsForSpillOp(unsigned Op) { 940 941 switch (Op) { 942 case AMDGPU::SI_SPILL_S1024_SAVE: 943 case AMDGPU::SI_SPILL_S1024_RESTORE: 944 case AMDGPU::SI_SPILL_V1024_SAVE: 945 case AMDGPU::SI_SPILL_V1024_RESTORE: 946 case AMDGPU::SI_SPILL_A1024_SAVE: 947 case AMDGPU::SI_SPILL_A1024_RESTORE: 948 case AMDGPU::SI_SPILL_AV1024_SAVE: 949 case AMDGPU::SI_SPILL_AV1024_RESTORE: 950 return 32; 951 case AMDGPU::SI_SPILL_S512_SAVE: 952 case AMDGPU::SI_SPILL_S512_RESTORE: 953 case AMDGPU::SI_SPILL_V512_SAVE: 954 case AMDGPU::SI_SPILL_V512_RESTORE: 955 case AMDGPU::SI_SPILL_A512_SAVE: 956 case AMDGPU::SI_SPILL_A512_RESTORE: 957 case AMDGPU::SI_SPILL_AV512_SAVE: 958 case AMDGPU::SI_SPILL_AV512_RESTORE: 959 return 16; 960 case AMDGPU::SI_SPILL_S256_SAVE: 961 case AMDGPU::SI_SPILL_S256_RESTORE: 962 case AMDGPU::SI_SPILL_V256_SAVE: 963 case AMDGPU::SI_SPILL_V256_RESTORE: 964 case AMDGPU::SI_SPILL_A256_SAVE: 965 case AMDGPU::SI_SPILL_A256_RESTORE: 966 case AMDGPU::SI_SPILL_AV256_SAVE: 967 case AMDGPU::SI_SPILL_AV256_RESTORE: 968 return 8; 969 case AMDGPU::SI_SPILL_S224_SAVE: 970 case AMDGPU::SI_SPILL_S224_RESTORE: 971 case AMDGPU::SI_SPILL_V224_SAVE: 972 case AMDGPU::SI_SPILL_V224_RESTORE: 973 case AMDGPU::SI_SPILL_A224_SAVE: 974 case AMDGPU::SI_SPILL_A224_RESTORE: 975 case AMDGPU::SI_SPILL_AV224_SAVE: 976 case AMDGPU::SI_SPILL_AV224_RESTORE: 977 return 7; 978 case AMDGPU::SI_SPILL_S192_SAVE: 979 case AMDGPU::SI_SPILL_S192_RESTORE: 980 case AMDGPU::SI_SPILL_V192_SAVE: 981 case AMDGPU::SI_SPILL_V192_RESTORE: 982 case AMDGPU::SI_SPILL_A192_SAVE: 983 case AMDGPU::SI_SPILL_A192_RESTORE: 984 case AMDGPU::SI_SPILL_AV192_SAVE: 985 case AMDGPU::SI_SPILL_AV192_RESTORE: 986 return 6; 987 case 
AMDGPU::SI_SPILL_S160_SAVE: 988 case AMDGPU::SI_SPILL_S160_RESTORE: 989 case AMDGPU::SI_SPILL_V160_SAVE: 990 case AMDGPU::SI_SPILL_V160_RESTORE: 991 case AMDGPU::SI_SPILL_A160_SAVE: 992 case AMDGPU::SI_SPILL_A160_RESTORE: 993 case AMDGPU::SI_SPILL_AV160_SAVE: 994 case AMDGPU::SI_SPILL_AV160_RESTORE: 995 return 5; 996 case AMDGPU::SI_SPILL_S128_SAVE: 997 case AMDGPU::SI_SPILL_S128_RESTORE: 998 case AMDGPU::SI_SPILL_V128_SAVE: 999 case AMDGPU::SI_SPILL_V128_RESTORE: 1000 case AMDGPU::SI_SPILL_A128_SAVE: 1001 case AMDGPU::SI_SPILL_A128_RESTORE: 1002 case AMDGPU::SI_SPILL_AV128_SAVE: 1003 case AMDGPU::SI_SPILL_AV128_RESTORE: 1004 return 4; 1005 case AMDGPU::SI_SPILL_S96_SAVE: 1006 case AMDGPU::SI_SPILL_S96_RESTORE: 1007 case AMDGPU::SI_SPILL_V96_SAVE: 1008 case AMDGPU::SI_SPILL_V96_RESTORE: 1009 case AMDGPU::SI_SPILL_A96_SAVE: 1010 case AMDGPU::SI_SPILL_A96_RESTORE: 1011 case AMDGPU::SI_SPILL_AV96_SAVE: 1012 case AMDGPU::SI_SPILL_AV96_RESTORE: 1013 return 3; 1014 case AMDGPU::SI_SPILL_S64_SAVE: 1015 case AMDGPU::SI_SPILL_S64_RESTORE: 1016 case AMDGPU::SI_SPILL_V64_SAVE: 1017 case AMDGPU::SI_SPILL_V64_RESTORE: 1018 case AMDGPU::SI_SPILL_A64_SAVE: 1019 case AMDGPU::SI_SPILL_A64_RESTORE: 1020 case AMDGPU::SI_SPILL_AV64_SAVE: 1021 case AMDGPU::SI_SPILL_AV64_RESTORE: 1022 return 2; 1023 case AMDGPU::SI_SPILL_S32_SAVE: 1024 case AMDGPU::SI_SPILL_S32_RESTORE: 1025 case AMDGPU::SI_SPILL_V32_SAVE: 1026 case AMDGPU::SI_SPILL_V32_RESTORE: 1027 case AMDGPU::SI_SPILL_A32_SAVE: 1028 case AMDGPU::SI_SPILL_A32_RESTORE: 1029 case AMDGPU::SI_SPILL_AV32_SAVE: 1030 case AMDGPU::SI_SPILL_AV32_RESTORE: 1031 return 1; 1032 default: llvm_unreachable("Invalid spill opcode"); 1033 } 1034 } 1035 1036 static int getOffsetMUBUFStore(unsigned Opc) { 1037 switch (Opc) { 1038 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 1039 return AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1040 case AMDGPU::BUFFER_STORE_BYTE_OFFEN: 1041 return AMDGPU::BUFFER_STORE_BYTE_OFFSET; 1042 case AMDGPU::BUFFER_STORE_SHORT_OFFEN: 1043 
return AMDGPU::BUFFER_STORE_SHORT_OFFSET; 1044 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: 1045 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; 1046 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN: 1047 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET; 1048 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: 1049 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; 1050 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: 1051 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; 1052 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: 1053 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; 1054 default: 1055 return -1; 1056 } 1057 } 1058 1059 static int getOffsetMUBUFLoad(unsigned Opc) { 1060 switch (Opc) { 1061 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 1062 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1063 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: 1064 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; 1065 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: 1066 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; 1067 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: 1068 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; 1069 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: 1070 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; 1071 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: 1072 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; 1073 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN: 1074 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET; 1075 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: 1076 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; 1077 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: 1078 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; 1079 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: 1080 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; 1081 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: 1082 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; 1083 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: 1084 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; 1085 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: 1086 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; 1087 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: 1088 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; 1089 default: 
1090 return -1; 1091 } 1092 } 1093 1094 static int getOffenMUBUFStore(unsigned Opc) { 1095 switch (Opc) { 1096 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 1097 return AMDGPU::BUFFER_STORE_DWORD_OFFEN; 1098 case AMDGPU::BUFFER_STORE_BYTE_OFFSET: 1099 return AMDGPU::BUFFER_STORE_BYTE_OFFEN; 1100 case AMDGPU::BUFFER_STORE_SHORT_OFFSET: 1101 return AMDGPU::BUFFER_STORE_SHORT_OFFEN; 1102 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: 1103 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; 1104 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET: 1105 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN; 1106 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET: 1107 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; 1108 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET: 1109 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN; 1110 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET: 1111 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN; 1112 default: 1113 return -1; 1114 } 1115 } 1116 1117 static int getOffenMUBUFLoad(unsigned Opc) { 1118 switch (Opc) { 1119 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 1120 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN; 1121 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET: 1122 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN; 1123 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET: 1124 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN; 1125 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET: 1126 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN; 1127 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET: 1128 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN; 1129 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET: 1130 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; 1131 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET: 1132 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN; 1133 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET: 1134 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; 1135 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET: 1136 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN; 1137 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET: 1138 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN; 1139 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET: 1140 return 
AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN; 1141 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET: 1142 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN; 1143 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET: 1144 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN; 1145 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET: 1146 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN; 1147 default: 1148 return -1; 1149 } 1150 } 1151 1152 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, 1153 MachineBasicBlock &MBB, 1154 MachineBasicBlock::iterator MI, 1155 int Index, unsigned Lane, 1156 unsigned ValueReg, bool IsKill) { 1157 MachineFunction *MF = MBB.getParent(); 1158 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1159 const SIInstrInfo *TII = ST.getInstrInfo(); 1160 1161 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); 1162 1163 if (Reg == AMDGPU::NoRegister) 1164 return MachineInstrBuilder(); 1165 1166 bool IsStore = MI->mayStore(); 1167 MachineRegisterInfo &MRI = MF->getRegInfo(); 1168 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 1169 1170 unsigned Dst = IsStore ? Reg : ValueReg; 1171 unsigned Src = IsStore ? ValueReg : Reg; 1172 bool IsVGPR = TRI->isVGPR(MRI, Reg); 1173 DebugLoc DL = MI->getDebugLoc(); 1174 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) { 1175 // Spiller during regalloc may restore a spilled register to its superclass. 1176 // It could result in AGPR spills restored to VGPRs or the other way around, 1177 // making the src and dst with identical regclasses at this point. It just 1178 // needs a copy in such cases. 1179 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst) 1180 .addReg(Src, getKillRegState(IsKill)); 1181 CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1182 return CopyMIB; 1183 } 1184 unsigned Opc = (IsStore ^ IsVGPR) ? 
AMDGPU::V_ACCVGPR_WRITE_B32_e64 1185 : AMDGPU::V_ACCVGPR_READ_B32_e64; 1186 1187 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst) 1188 .addReg(Src, getKillRegState(IsKill)); 1189 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1190 return MIB; 1191 } 1192 1193 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not 1194 // need to handle the case where an SGPR may need to be spilled while spilling. 1195 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, 1196 MachineFrameInfo &MFI, 1197 MachineBasicBlock::iterator MI, 1198 int Index, 1199 int64_t Offset) { 1200 const SIInstrInfo *TII = ST.getInstrInfo(); 1201 MachineBasicBlock *MBB = MI->getParent(); 1202 const DebugLoc &DL = MI->getDebugLoc(); 1203 bool IsStore = MI->mayStore(); 1204 1205 unsigned Opc = MI->getOpcode(); 1206 int LoadStoreOp = IsStore ? 1207 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); 1208 if (LoadStoreOp == -1) 1209 return false; 1210 1211 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); 1212 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr()) 1213 return true; 1214 1215 MachineInstrBuilder NewMI = 1216 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) 1217 .add(*Reg) 1218 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) 1219 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) 1220 .addImm(Offset) 1221 .addImm(0) // cpol 1222 .addImm(0) // tfe 1223 .addImm(0) // swz 1224 .cloneMemRefs(*MI); 1225 1226 const MachineOperand *VDataIn = TII->getNamedOperand(*MI, 1227 AMDGPU::OpName::vdata_in); 1228 if (VDataIn) 1229 NewMI.add(*VDataIn); 1230 return true; 1231 } 1232 1233 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, 1234 unsigned LoadStoreOp, 1235 unsigned EltSize) { 1236 bool IsStore = TII->get(LoadStoreOp).mayStore(); 1237 bool HasVAddr = AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) != -1; 1238 bool UseST = 1239 !HasVAddr && 1240 
AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0; 1241 1242 switch (EltSize) { 1243 case 4: 1244 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1245 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR; 1246 break; 1247 case 8: 1248 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR 1249 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR; 1250 break; 1251 case 12: 1252 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR 1253 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR; 1254 break; 1255 case 16: 1256 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR 1257 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR; 1258 break; 1259 default: 1260 llvm_unreachable("Unexpected spill load/store size!"); 1261 } 1262 1263 if (HasVAddr) 1264 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp); 1265 else if (UseST) 1266 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1267 1268 return LoadStoreOp; 1269 } 1270 1271 void SIRegisterInfo::buildSpillLoadStore( 1272 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, 1273 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill, 1274 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO, 1275 RegScavenger *RS, LivePhysRegs *LiveRegs) const { 1276 assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both"); 1277 1278 MachineFunction *MF = MBB.getParent(); 1279 const SIInstrInfo *TII = ST.getInstrInfo(); 1280 const MachineFrameInfo &MFI = MF->getFrameInfo(); 1281 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); 1282 1283 const MCInstrDesc *Desc = &TII->get(LoadStoreOp); 1284 bool IsStore = Desc->mayStore(); 1285 bool IsFlat = TII->isFLATScratch(LoadStoreOp); 1286 1287 bool CanClobberSCC = false; 1288 bool Scavenged = false; 1289 MCRegister SOffset = ScratchOffsetReg; 1290 1291 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); 1292 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and 
stores. 1293 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC); 1294 const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8; 1295 1296 // Always use 4 byte operations for AGPRs because we need to scavenge 1297 // a temporary VGPR. 1298 unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u; 1299 unsigned NumSubRegs = RegWidth / EltSize; 1300 unsigned Size = NumSubRegs * EltSize; 1301 unsigned RemSize = RegWidth - Size; 1302 unsigned NumRemSubRegs = RemSize ? 1 : 0; 1303 int64_t Offset = InstOffset + MFI.getObjectOffset(Index); 1304 int64_t MaterializedOffset = Offset; 1305 1306 int64_t MaxOffset = Offset + Size + RemSize - EltSize; 1307 int64_t ScratchOffsetRegDelta = 0; 1308 1309 if (IsFlat && EltSize > 4) { 1310 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 1311 Desc = &TII->get(LoadStoreOp); 1312 } 1313 1314 Align Alignment = MFI.getObjectAlign(Index); 1315 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); 1316 1317 assert((IsFlat || ((Offset % EltSize) == 0)) && 1318 "unexpected VGPR spill offset"); 1319 1320 // Track a VGPR to use for a constant offset we need to materialize. 1321 Register TmpOffsetVGPR; 1322 1323 // Track a VGPR to use as an intermediate value. 1324 Register TmpIntermediateVGPR; 1325 bool UseVGPROffset = false; 1326 1327 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate 1328 // combination. 1329 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR, 1330 int64_t VOffset) { 1331 // We are using a VGPR offset 1332 if (IsFlat && SGPRBase) { 1333 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free 1334 // SGPR, so perform the add as vector. 1335 // We don't need a base SGPR in the kernel. 
1336 1337 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) { 1338 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR) 1339 .addReg(SGPRBase) 1340 .addImm(VOffset) 1341 .addImm(0); // clamp 1342 } else { 1343 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 1344 .addReg(SGPRBase); 1345 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR) 1346 .addImm(VOffset) 1347 .addReg(TmpOffsetVGPR); 1348 } 1349 } else { 1350 assert(TmpOffsetVGPR); 1351 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 1352 .addImm(VOffset); 1353 } 1354 }; 1355 1356 bool IsOffsetLegal = 1357 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, 1358 SIInstrFlags::FlatScratch) 1359 : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset); 1360 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { 1361 SOffset = MCRegister(); 1362 1363 // We don't have access to the register scavenger if this function is called 1364 // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case. 1365 // TODO: Clobbering SCC is not necessary for scratch instructions in the 1366 // entry. 1367 if (RS) { 1368 SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); 1369 1370 // Piggy back on the liveness scan we just did see if SCC is dead. 
1371 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC); 1372 } else if (LiveRegs) { 1373 CanClobberSCC = !LiveRegs->contains(AMDGPU::SCC); 1374 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) { 1375 if (LiveRegs->available(MF->getRegInfo(), Reg)) { 1376 SOffset = Reg; 1377 break; 1378 } 1379 } 1380 } 1381 1382 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC) 1383 SOffset = Register(); 1384 1385 if (!SOffset) { 1386 UseVGPROffset = true; 1387 1388 if (RS) { 1389 TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1390 } else { 1391 assert(LiveRegs); 1392 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) { 1393 if (LiveRegs->available(MF->getRegInfo(), Reg)) { 1394 TmpOffsetVGPR = Reg; 1395 break; 1396 } 1397 } 1398 } 1399 1400 assert(TmpOffsetVGPR); 1401 } else if (!SOffset && CanClobberSCC) { 1402 // There are no free SGPRs, and since we are in the process of spilling 1403 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true 1404 // on SI/CI and on VI it is true until we implement spilling using scalar 1405 // stores), we have no way to free up an SGPR. Our solution here is to 1406 // add the offset directly to the ScratchOffset or StackPtrOffset 1407 // register, and then subtract the offset after the spill to return the 1408 // register to it's original value. 1409 1410 // TODO: If we don't have to do an emergency stack slot spill, converting 1411 // to use the VGPR offset is fewer instructions. 1412 if (!ScratchOffsetReg) 1413 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg(); 1414 SOffset = ScratchOffsetReg; 1415 ScratchOffsetRegDelta = Offset; 1416 } else { 1417 Scavenged = true; 1418 } 1419 1420 // We currently only support spilling VGPRs to EltSize boundaries, meaning 1421 // we can simplify the adjustment of Offset here to just scale with 1422 // WavefrontSize. 
1423 if (!IsFlat && !UseVGPROffset) 1424 Offset *= ST.getWavefrontSize(); 1425 1426 if (!UseVGPROffset && !SOffset) 1427 report_fatal_error("could not scavenge SGPR to spill in entry function"); 1428 1429 if (UseVGPROffset) { 1430 // We are using a VGPR offset 1431 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset); 1432 } else if (ScratchOffsetReg == AMDGPU::NoRegister) { 1433 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); 1434 } else { 1435 assert(Offset != 0); 1436 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1437 .addReg(ScratchOffsetReg) 1438 .addImm(Offset); 1439 Add->getOperand(3).setIsDead(); // Mark SCC as dead. 1440 } 1441 1442 Offset = 0; 1443 } 1444 1445 if (IsFlat && SOffset == AMDGPU::NoRegister) { 1446 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 1447 && "Unexpected vaddr for flat scratch with a FI operand"); 1448 1449 if (UseVGPROffset) { 1450 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp); 1451 } else { 1452 assert(ST.hasFlatScratchSTMode()); 1453 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1454 } 1455 1456 Desc = &TII->get(LoadStoreOp); 1457 } 1458 1459 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e; 1460 ++i, RegOffset += EltSize) { 1461 if (i == NumSubRegs) { 1462 EltSize = RemSize; 1463 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 1464 } 1465 Desc = &TII->get(LoadStoreOp); 1466 1467 if (!IsFlat && UseVGPROffset) { 1468 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp) 1469 : getOffenMUBUFLoad(LoadStoreOp); 1470 Desc = &TII->get(NewLoadStoreOp); 1471 } 1472 1473 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) { 1474 // If we are spilling an AGPR beyond the range of the memory instruction 1475 // offset and need to use a VGPR offset, we ideally have at least 2 1476 // scratch VGPRs. 
If we don't have a second free VGPR without spilling, 1477 // recycle the VGPR used for the offset which requires resetting after 1478 // each subregister. 1479 1480 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset); 1481 } 1482 1483 unsigned NumRegs = EltSize / 4; 1484 Register SubReg = e == 1 1485 ? ValueReg 1486 : Register(getSubReg(ValueReg, 1487 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1488 1489 unsigned SOffsetRegState = 0; 1490 unsigned SrcDstRegState = getDefRegState(!IsStore); 1491 const bool IsLastSubReg = i + 1 == e; 1492 if (IsLastSubReg) { 1493 SOffsetRegState |= getKillRegState(Scavenged); 1494 // The last implicit use carries the "Kill" flag. 1495 SrcDstRegState |= getKillRegState(IsKill); 1496 } 1497 1498 // Make sure the whole register is defined if there are undef components by 1499 // adding an implicit def of the super-reg on the first instruction. 1500 bool NeedSuperRegDef = e > 1 && IsStore && i == 0; 1501 bool NeedSuperRegImpOperand = e > 1; 1502 1503 // Remaining element size to spill into memory after some parts of it 1504 // spilled into either AGPRs or VGPRs. 1505 unsigned RemEltSize = EltSize; 1506 1507 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order, 1508 // starting from the last lane. In case if a register cannot be completely 1509 // spilled into another register that will ensure its alignment does not 1510 // change. For targets with VGPR alignment requirement this is important 1511 // in case of flat scratch usage as we might get a scratch_load or 1512 // scratch_store of an unaligned register otherwise. 1513 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS, 1514 LaneE = RegOffset / 4; 1515 Lane >= LaneE; --Lane) { 1516 bool IsSubReg = e > 1 || EltSize > 4; 1517 Register Sub = IsSubReg 1518 ? 
Register(getSubReg(ValueReg, getSubRegFromChannel(Lane))) 1519 : ValueReg; 1520 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill); 1521 if (!MIB.getInstr()) 1522 break; 1523 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && !i)) { 1524 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1525 NeedSuperRegDef = false; 1526 } 1527 if (IsSubReg || NeedSuperRegImpOperand) { 1528 NeedSuperRegImpOperand = true; 1529 unsigned State = SrcDstRegState; 1530 if (Lane != LaneE) 1531 State &= ~RegState::Kill; 1532 MIB.addReg(ValueReg, RegState::Implicit | State); 1533 } 1534 RemEltSize -= 4; 1535 } 1536 1537 if (!RemEltSize) // Fully spilled into AGPRs. 1538 continue; 1539 1540 if (RemEltSize != EltSize) { // Partially spilled to AGPRs 1541 assert(IsFlat && EltSize > 4); 1542 1543 unsigned NumRegs = RemEltSize / 4; 1544 SubReg = Register(getSubReg(ValueReg, 1545 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1546 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize); 1547 Desc = &TII->get(Opc); 1548 } 1549 1550 unsigned FinalReg = SubReg; 1551 1552 if (IsAGPR) { 1553 assert(EltSize == 4); 1554 1555 if (!TmpIntermediateVGPR) { 1556 assert(MF->getRegInfo().isReserved(AMDGPU::VGPR32)); 1557 TmpIntermediateVGPR = AMDGPU::VGPR32; 1558 } 1559 if (IsStore) { 1560 auto AccRead = BuildMI(MBB, MI, DL, 1561 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), 1562 TmpIntermediateVGPR) 1563 .addReg(SubReg, getKillRegState(IsKill)); 1564 if (NeedSuperRegDef) 1565 AccRead.addReg(ValueReg, RegState::ImplicitDefine); 1566 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1567 } 1568 SubReg = TmpIntermediateVGPR; 1569 } else if (UseVGPROffset) { 1570 // FIXME: change to scavengeRegisterBackwards() 1571 if (!TmpOffsetVGPR) { 1572 TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1573 RS->setRegUsed(TmpOffsetVGPR); 1574 } 1575 } 1576 1577 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); 1578 MachineMemOperand *NewMMO = 1579 
MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, 1580 commonAlignment(Alignment, RegOffset)); 1581 1582 auto MIB = 1583 BuildMI(MBB, MI, DL, *Desc) 1584 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)); 1585 1586 if (UseVGPROffset) { 1587 // For an AGPR spill, we reuse the same temp VGPR for the offset and the 1588 // intermediate accvgpr_write. 1589 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR)); 1590 } 1591 1592 if (!IsFlat) 1593 MIB.addReg(FuncInfo->getScratchRSrcReg()); 1594 1595 if (SOffset == AMDGPU::NoRegister) { 1596 if (!IsFlat) { 1597 if (UseVGPROffset && ScratchOffsetReg) { 1598 assert(!FuncInfo->isEntryFunction()); 1599 MIB.addReg(ScratchOffsetReg); 1600 } else { 1601 assert(FuncInfo->isEntryFunction()); 1602 MIB.addImm(0); 1603 } 1604 } 1605 } else { 1606 MIB.addReg(SOffset, SOffsetRegState); 1607 } 1608 MIB.addImm(Offset + RegOffset) 1609 .addImm(0); // cpol 1610 if (!IsFlat) 1611 MIB.addImm(0) // tfe 1612 .addImm(0); // swz 1613 MIB.addMemOperand(NewMMO); 1614 1615 if (!IsAGPR && NeedSuperRegDef) 1616 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1617 1618 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) { 1619 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), 1620 FinalReg) 1621 .addReg(TmpIntermediateVGPR, RegState::Kill); 1622 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1623 } 1624 1625 if (NeedSuperRegImpOperand) 1626 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 1627 } 1628 1629 if (ScratchOffsetRegDelta != 0) { 1630 // Subtract the offset we added to the ScratchOffset register. 
1631 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1632 .addReg(SOffset) 1633 .addImm(-ScratchOffsetRegDelta); 1634 } 1635 } 1636 1637 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, 1638 int Offset, bool IsLoad, 1639 bool IsKill) const { 1640 // Load/store VGPR 1641 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); 1642 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); 1643 1644 Register FrameReg = 1645 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF) 1646 ? getBaseRegister() 1647 : getFrameRegister(SB.MF); 1648 1649 Align Alignment = FrameInfo.getObjectAlign(Index); 1650 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index); 1651 MachineMemOperand *MMO = SB.MF.getMachineMemOperand( 1652 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, 1653 SB.EltSize, Alignment); 1654 1655 if (IsLoad) { 1656 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1657 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1658 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, 1659 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1660 } else { 1661 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 1662 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1663 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, 1664 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1665 // This only ever adds one VGPR spill 1666 SB.MFI.addToSpilledVGPRs(1); 1667 } 1668 } 1669 1670 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, 1671 int Index, 1672 RegScavenger *RS, 1673 LiveIntervals *LIS, 1674 bool OnlyToVGPR) const { 1675 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1676 1677 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = 1678 SB.MFI.getSGPRToVGPRSpills(Index); 1679 bool SpillToVGPR = !VGPRSpills.empty(); 1680 if (OnlyToVGPR && !SpillToVGPR) 1681 return false; 1682 1683 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && 1684 SB.SuperReg != SB.MFI.getFrameOffsetReg())); 1685 1686 if (SpillToVGPR) { 1687 1688 assert(SB.NumSubRegs == VGPRSpills.size() && 1689 "Num of VGPR lanes should be equal to num of SGPRs spilled"); 1690 1691 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1692 Register SubReg = 1693 SB.NumSubRegs == 1 1694 ? SB.SuperReg 1695 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1696 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 1697 1698 bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1; 1699 1700 // Mark the "old value of vgpr" input undef only if this is the first sgpr 1701 // spill to this specific vgpr in the first basic block. 
1702 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1703 SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) 1704 .addReg(SubReg, getKillRegState(UseKill)) 1705 .addImm(Spill.Lane) 1706 .addReg(Spill.VGPR); 1707 if (LIS) { 1708 if (i == 0) 1709 LIS->ReplaceMachineInstrInMaps(*MI, *MIB); 1710 else 1711 LIS->InsertMachineInstrInMaps(*MIB); 1712 } 1713 1714 if (i == 0 && SB.NumSubRegs > 1) { 1715 // We may be spilling a super-register which is only partially defined, 1716 // and need to ensure later spills think the value is defined. 1717 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1718 } 1719 1720 if (SB.NumSubRegs > 1) 1721 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit); 1722 1723 // FIXME: Since this spills to another register instead of an actual 1724 // frame index, we should delete the frame index when all references to 1725 // it are fixed. 1726 } 1727 } else { 1728 SB.prepare(); 1729 1730 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. 1731 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1732 1733 // Per VGPR helper data 1734 auto PVD = SB.getPerVGPRData(); 1735 1736 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1737 unsigned TmpVGPRFlags = RegState::Undef; 1738 1739 // Write sub registers into the VGPR 1740 for (unsigned i = Offset * PVD.PerVGPR, 1741 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1742 i < e; ++i) { 1743 Register SubReg = 1744 SB.NumSubRegs == 1 1745 ? 
SB.SuperReg 1746 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1747 1748 MachineInstrBuilder WriteLane = 1749 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1750 SB.TmpVGPR) 1751 .addReg(SubReg, SubKillState) 1752 .addImm(i % PVD.PerVGPR) 1753 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1754 TmpVGPRFlags = 0; 1755 1756 if (LIS) { 1757 if (i == 0) 1758 LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane); 1759 else 1760 LIS->InsertMachineInstrInMaps(*WriteLane); 1761 } 1762 1763 // There could be undef components of a spilled super register. 1764 // TODO: Can we detect this and skip the spill? 1765 if (SB.NumSubRegs > 1) { 1766 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1767 unsigned SuperKillState = 0; 1768 if (i + 1 == SB.NumSubRegs) 1769 SuperKillState |= getKillRegState(SB.IsKill); 1770 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1771 } 1772 } 1773 1774 // Write out VGPR 1775 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); 1776 } 1777 1778 SB.restore(); 1779 } 1780 1781 MI->eraseFromParent(); 1782 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1783 1784 if (LIS) 1785 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1786 1787 return true; 1788 } 1789 1790 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, 1791 int Index, 1792 RegScavenger *RS, 1793 LiveIntervals *LIS, 1794 bool OnlyToVGPR) const { 1795 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1796 1797 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = 1798 SB.MFI.getSGPRToVGPRSpills(Index); 1799 bool SpillToVGPR = !VGPRSpills.empty(); 1800 if (OnlyToVGPR && !SpillToVGPR) 1801 return false; 1802 1803 if (SpillToVGPR) { 1804 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1805 Register SubReg = 1806 SB.NumSubRegs == 1 1807 ? 
SB.SuperReg 1808 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1809 1810 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 1811 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1812 SubReg) 1813 .addReg(Spill.VGPR) 1814 .addImm(Spill.Lane); 1815 if (SB.NumSubRegs > 1 && i == 0) 1816 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1817 if (LIS) { 1818 if (i == e - 1) 1819 LIS->ReplaceMachineInstrInMaps(*MI, *MIB); 1820 else 1821 LIS->InsertMachineInstrInMaps(*MIB); 1822 } 1823 1824 } 1825 } else { 1826 SB.prepare(); 1827 1828 // Per VGPR helper data 1829 auto PVD = SB.getPerVGPRData(); 1830 1831 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1832 // Load in VGPR data 1833 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); 1834 1835 // Unpack lanes 1836 for (unsigned i = Offset * PVD.PerVGPR, 1837 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1838 i < e; ++i) { 1839 Register SubReg = 1840 SB.NumSubRegs == 1 1841 ? SB.SuperReg 1842 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1843 1844 bool LastSubReg = (i + 1 == e); 1845 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1846 SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) 1847 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1848 .addImm(i); 1849 if (SB.NumSubRegs > 1 && i == 0) 1850 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1851 if (LIS) { 1852 if (i == e - 1) 1853 LIS->ReplaceMachineInstrInMaps(*MI, *MIB); 1854 else 1855 LIS->InsertMachineInstrInMaps(*MIB); 1856 } 1857 } 1858 } 1859 1860 SB.restore(); 1861 } 1862 1863 MI->eraseFromParent(); 1864 1865 if (LIS) 1866 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1867 1868 return true; 1869 } 1870 1871 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, 1872 MachineBasicBlock &RestoreMBB, 1873 Register SGPR, RegScavenger *RS) const { 1874 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0, 1875 RS); 1876 SB.prepare(); 1877 // Generate the spill of SGPR to 
SB.TmpVGPR. 1878 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1879 auto PVD = SB.getPerVGPRData(); 1880 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1881 unsigned TmpVGPRFlags = RegState::Undef; 1882 // Write sub registers into the VGPR 1883 for (unsigned i = Offset * PVD.PerVGPR, 1884 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1885 i < e; ++i) { 1886 Register SubReg = 1887 SB.NumSubRegs == 1 1888 ? SB.SuperReg 1889 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1890 1891 MachineInstrBuilder WriteLane = 1892 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1893 SB.TmpVGPR) 1894 .addReg(SubReg, SubKillState) 1895 .addImm(i % PVD.PerVGPR) 1896 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1897 TmpVGPRFlags = 0; 1898 // There could be undef components of a spilled super register. 1899 // TODO: Can we detect this and skip the spill? 1900 if (SB.NumSubRegs > 1) { 1901 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1902 unsigned SuperKillState = 0; 1903 if (i + 1 == SB.NumSubRegs) 1904 SuperKillState |= getKillRegState(SB.IsKill); 1905 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1906 } 1907 } 1908 // Don't need to write VGPR out. 1909 } 1910 1911 // Restore clobbered registers in the specified restore block. 1912 MI = RestoreMBB.end(); 1913 SB.setMI(&RestoreMBB, MI); 1914 // Generate the restore of SGPR from SB.TmpVGPR. 1915 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1916 // Don't need to load VGPR in. 1917 // Unpack lanes 1918 for (unsigned i = Offset * PVD.PerVGPR, 1919 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1920 i < e; ++i) { 1921 Register SubReg = 1922 SB.NumSubRegs == 1 1923 ? 
SB.SuperReg 1924 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1925 bool LastSubReg = (i + 1 == e); 1926 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1927 SubReg) 1928 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1929 .addImm(i); 1930 if (SB.NumSubRegs > 1 && i == 0) 1931 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1932 } 1933 } 1934 SB.restore(); 1935 1936 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1937 return false; 1938 } 1939 1940 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 1941 /// a VGPR and the stack slot can be safely eliminated when all other users are 1942 /// handled. 1943 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 1944 MachineBasicBlock::iterator MI, 1945 int FI, 1946 RegScavenger *RS, 1947 LiveIntervals *LIS) const { 1948 switch (MI->getOpcode()) { 1949 case AMDGPU::SI_SPILL_S1024_SAVE: 1950 case AMDGPU::SI_SPILL_S512_SAVE: 1951 case AMDGPU::SI_SPILL_S256_SAVE: 1952 case AMDGPU::SI_SPILL_S224_SAVE: 1953 case AMDGPU::SI_SPILL_S192_SAVE: 1954 case AMDGPU::SI_SPILL_S160_SAVE: 1955 case AMDGPU::SI_SPILL_S128_SAVE: 1956 case AMDGPU::SI_SPILL_S96_SAVE: 1957 case AMDGPU::SI_SPILL_S64_SAVE: 1958 case AMDGPU::SI_SPILL_S32_SAVE: 1959 return spillSGPR(MI, FI, RS, LIS, true); 1960 case AMDGPU::SI_SPILL_S1024_RESTORE: 1961 case AMDGPU::SI_SPILL_S512_RESTORE: 1962 case AMDGPU::SI_SPILL_S256_RESTORE: 1963 case AMDGPU::SI_SPILL_S224_RESTORE: 1964 case AMDGPU::SI_SPILL_S192_RESTORE: 1965 case AMDGPU::SI_SPILL_S160_RESTORE: 1966 case AMDGPU::SI_SPILL_S128_RESTORE: 1967 case AMDGPU::SI_SPILL_S96_RESTORE: 1968 case AMDGPU::SI_SPILL_S64_RESTORE: 1969 case AMDGPU::SI_SPILL_S32_RESTORE: 1970 return restoreSGPR(MI, FI, RS, LIS, true); 1971 default: 1972 llvm_unreachable("not an SGPR spill instruction"); 1973 } 1974 } 1975 1976 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 1977 int SPAdj, unsigned FIOperandNum, 1978 RegScavenger *RS) const { 1979 
/// Rewrite the frame-index operand \p FIOperandNum of \p MI into a concrete
/// register and/or immediate.
///
/// SGPR spill pseudos are forwarded to spillSGPR / restoreSGPR. Vector
/// register (V/A/AV) spill pseudos are expanded with buildSpillLoadStore and
/// then erased. Every other instruction goes through the default case, which
/// has a flat-scratch path, a path for non-MUBUF users in non-entry functions,
/// a MUBUF path, and an immediate fallback.
///
/// \param SPAdj must be 0 (asserted).
/// \param RS register scavenger; dereferenced without a null check on the
///        paths that need a temporary register.
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                        int SPAdj, unsigned FIOperandNum,
                                        RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  // Fixed objects are addressed from the base register when the function has
  // a base pointer; everything else uses the frame register.
  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
                          ? getBaseRegister()
                          : getFrameRegister(*MF);

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S1024_SAVE:
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S224_SAVE:
    case AMDGPU::SI_SPILL_S192_SAVE:
    case AMDGPU::SI_SPILL_S160_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S96_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      spillSGPR(MI, Index, RS);
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S1024_RESTORE:
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S224_RESTORE:
    case AMDGPU::SI_SPILL_S192_RESTORE:
    case AMDGPU::SI_SPILL_S160_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S96_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      restoreSGPR(MI, Index, RS);
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V1024_SAVE:
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V224_SAVE:
    case AMDGPU::SI_SPILL_V192_SAVE:
    case AMDGPU::SI_SPILL_V160_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE:
    case AMDGPU::SI_SPILL_A1024_SAVE:
    case AMDGPU::SI_SPILL_A512_SAVE:
    case AMDGPU::SI_SPILL_A256_SAVE:
    case AMDGPU::SI_SPILL_A224_SAVE:
    case AMDGPU::SI_SPILL_A192_SAVE:
    case AMDGPU::SI_SPILL_A160_SAVE:
    case AMDGPU::SI_SPILL_A128_SAVE:
    case AMDGPU::SI_SPILL_A96_SAVE:
    case AMDGPU::SI_SPILL_A64_SAVE:
    case AMDGPU::SI_SPILL_A32_SAVE:
    case AMDGPU::SI_SPILL_AV1024_SAVE:
    case AMDGPU::SI_SPILL_AV512_SAVE:
    case AMDGPU::SI_SPILL_AV256_SAVE:
    case AMDGPU::SI_SPILL_AV224_SAVE:
    case AMDGPU::SI_SPILL_AV192_SAVE:
    case AMDGPU::SI_SPILL_AV160_SAVE:
    case AMDGPU::SI_SPILL_AV128_SAVE:
    case AMDGPU::SI_SPILL_AV96_SAVE:
    case AMDGPU::SI_SPILL_AV64_SAVE:
    case AMDGPU::SI_SPILL_AV32_SAVE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
             MFI->getStackPtrOffsetReg());

      // Store opcode depends on whether flat scratch is enabled.
      unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                            : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
      // Shadows the function-level MBB with the same value.
      auto *MBB = MI->getParent();
      buildSpillLoadStore(
          *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
          *MI->memoperands_begin(), RS);
      MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
      MI->eraseFromParent();
      break;
    }
    // Vector register restore
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V96_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V160_RESTORE:
    case AMDGPU::SI_SPILL_V192_RESTORE:
    case AMDGPU::SI_SPILL_V224_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE:
    case AMDGPU::SI_SPILL_V1024_RESTORE:
    case AMDGPU::SI_SPILL_A32_RESTORE:
    case AMDGPU::SI_SPILL_A64_RESTORE:
    case AMDGPU::SI_SPILL_A96_RESTORE:
    case AMDGPU::SI_SPILL_A128_RESTORE:
    case AMDGPU::SI_SPILL_A160_RESTORE:
    case AMDGPU::SI_SPILL_A192_RESTORE:
    case AMDGPU::SI_SPILL_A224_RESTORE:
    case AMDGPU::SI_SPILL_A256_RESTORE:
    case AMDGPU::SI_SPILL_A512_RESTORE:
    case AMDGPU::SI_SPILL_A1024_RESTORE:
    case AMDGPU::SI_SPILL_AV32_RESTORE:
    case AMDGPU::SI_SPILL_AV64_RESTORE:
    case AMDGPU::SI_SPILL_AV96_RESTORE:
    case AMDGPU::SI_SPILL_AV128_RESTORE:
    case AMDGPU::SI_SPILL_AV160_RESTORE:
    case AMDGPU::SI_SPILL_AV192_RESTORE:
    case AMDGPU::SI_SPILL_AV224_RESTORE:
    case AMDGPU::SI_SPILL_AV256_RESTORE:
    case AMDGPU::SI_SPILL_AV512_RESTORE:
    case AMDGPU::SI_SPILL_AV1024_RESTORE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
             MFI->getStackPtrOffsetReg());

      // Load opcode depends on whether flat scratch is enabled.
      unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                            : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
      // Shadows the function-level MBB with the same value.
      auto *MBB = MI->getParent();
      buildSpillLoadStore(
          *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
          *MI->memoperands_begin(), RS);
      MI->eraseFromParent();
      break;
    }

    default: {
      // Other access to frame index
      const DebugLoc &DL = MI->getDebugLoc();

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      if (ST.enableFlatScratch()) {
        if (TII->isFLATScratch(*MI)) {
          assert((int16_t)FIOperandNum ==
                 AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::saddr));

          // The offset is always swizzled, just replace it
          if (FrameReg)
            FIOp.ChangeToRegister(FrameReg, false);

          if (!Offset)
            return;

          // Try to fold the object offset into the instruction's immediate.
          MachineOperand *OffsetOp =
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
          int64_t NewOffset = Offset + OffsetOp->getImm();
          if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                     SIInstrFlags::FlatScratch)) {
            OffsetOp->setImm(NewOffset);
            if (FrameReg)
              return;
            Offset = 0;
          }

          // Reached with Offset == 0 only when the fold succeeded and there
          // is no frame register: switch to a form without saddr if possible.
          if (!Offset) {
            unsigned Opc = MI->getOpcode();
            int NewOpc = -1;
            if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) != -1) {
              NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
            } else if (ST.hasFlatScratchSTMode()) {
              // On GFX10 we have ST mode to use no registers for an address.
              // Otherwise we need to materialize 0 into an SGPR.
              NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
            }

            if (NewOpc != -1) {
              MI->removeOperand(
                  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
              MI->setDesc(TII->get(NewOpc));
              return;
            }
          }
        }

        if (!FrameReg) {
          FIOp.ChangeToImmediate(Offset);
          if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
            return;
        }

        // We need to use register here. Check if we can use an SGPR or need
        // a VGPR.
        // M0 is only a placeholder SGPR for the legality query; every path
        // below overwrites the operand's register.
        FIOp.ChangeToRegister(AMDGPU::M0, false);
        bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);

        if (!Offset && FrameReg && UseSGPR) {
          FIOp.setReg(FrameReg);
          return;
        }

        const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
                                                : &AMDGPU::VGPR_32RegClass;

        Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
        FIOp.setReg(TmpReg);
        FIOp.setIsKill(true);

        // Only one of frame register / offset is needed: a single move
        // materializes it.
        if ((!FrameReg || !Offset) && TmpReg) {
          unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
          auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
          if (FrameReg)
            MIB.addReg(FrameReg);
          else
            MIB.addImm(Offset);

          return;
        }

        // The sum is computed in an SGPR; when the operand needs a VGPR a
        // separate SGPR is scavenged for the intermediate value.
        Register TmpSReg =
            UseSGPR ? TmpReg
                    : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
                                           !UseSGPR);

        // TODO: for flat scratch another attempt can be made with a VGPR index
        //       if no SGPRs can be scavenged.
        if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
          report_fatal_error("Cannot scavenge register in FI elimination!");

        if (!TmpSReg) {
          // Use frame register and restore it after.
          TmpSReg = FrameReg;
          FIOp.setReg(FrameReg);
          FIOp.setIsKill(false);
        }

        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
            .addReg(FrameReg)
            .addImm(Offset);

        if (!UseSGPR)
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
              .addReg(TmpSReg, RegState::Kill);

        if (TmpSReg == FrameReg) {
          // Undo frame register modification.
          // Inserted after MI, unlike the instructions above.
          BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
                  FrameReg)
              .addReg(FrameReg)
              .addImm(-Offset);
        }

        return;
      }

      bool IsMUBUF = TII->isMUBUF(*MI);

      if (!IsMUBUF && !MFI->isEntryFunction()) {
        // Convert to a swizzled stack address by scaling by the wave size.
        //
        // In an entry function/kernel the offset is already swizzled.

        // A V_MOV_B32 user is replaced outright, so its destination can hold
        // the result; otherwise a VGPR is scavenged.
        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
        Register ResultReg =
            IsCopy ? MI->getOperand(0).getReg()
                   : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        if (Offset == 0) {
          // XXX - This never happens because of emergency scavenging slot at 0?
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
            .addImm(ST.getWavefrontSizeLog2())
            .addReg(FrameReg);
        } else {
          if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
            // Reuse ResultReg in intermediate step.
            Register ScaledReg = ResultReg;

            // The shift is inserted before the add built by getAddNoCarry.
            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
                    ScaledReg)
              .addImm(ST.getWavefrontSizeLog2())
              .addReg(FrameReg);

            const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;

            // TODO: Fold if use instruction is another add of a constant.
            if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
              // FIXME: This can fail
              MIB.addImm(Offset);
              MIB.addReg(ScaledReg, RegState::Kill);
              if (!IsVOP2)
                MIB.addImm(0); // clamp bit
            } else {
              assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
                     "Need to reuse carry out register");

              // Use scavenged unused carry out as offset register.
              Register ConstOffsetReg;
              if (!isWave32)
                ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
              else
                ConstOffsetReg = MIB.getReg(1);

              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
                .addImm(Offset);
              MIB.addReg(ConstOffsetReg, RegState::Kill);
              MIB.addReg(ScaledReg, RegState::Kill);
              MIB.addImm(0); // clamp bit
            }
          } else {
            // We have to produce a carry out, and there isn't a free SGPR pair
            // for it. We can keep the whole computation on the SALU to avoid
            // clobbering an additional register at the cost of an extra mov.

            // We may have 1 free scratch SGPR even though a carry out is
            // unavailable. Only one additional mov is needed.
            Register TmpScaledReg =
                RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
            Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;

            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
              .addReg(FrameReg)
              .addImm(ST.getWavefrontSizeLog2());
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
                .addReg(ScaledReg, RegState::Kill)
                .addImm(Offset);
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
              .addReg(ScaledReg, RegState::Kill);

            // If there were truly no free SGPRs, we need to undo everything.
            // Here ScaledReg is FrameReg, so the add and the shift are
            // reversed in place.
            if (!TmpScaledReg.isValid()) {
              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
                  .addReg(ScaledReg, RegState::Kill)
                  .addImm(-Offset);
              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
                .addReg(FrameReg)
                .addImm(ST.getWavefrontSizeLog2());
            }
          }
        }

        // Don't introduce an extra copy if we're just materializing in a mov.
        if (IsCopy)
          MI->eraseFromParent();
        else
          FIOp.ChangeToRegister(ResultReg, false, false, true);
        return;
      }

      if (IsMUBUF) {
        // Disable offen so we don't need a 0 vgpr base.
        assert(static_cast<int>(FIOperandNum) ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::vaddr));

        auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
        assert((SOffset.isImm() && SOffset.getImm() == 0));

        if (FrameReg != AMDGPU::NoRegister)
          SOffset.ChangeToRegister(FrameReg, false);

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        int64_t OldImm
          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
        int64_t NewOffset = OldImm + Offset;

        // If the combined offset is a legal immediate and the replacement
        // instruction could be built, the original is dropped.
        if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
            buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
          MI->eraseFromParent();
          return;
        }
      }

      // If the offset is simply too big, don't convert to a scratch wave offset
      // relative index.

      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
        // The immediate is not legal in this operand; materialize it in a
        // scavenged VGPR instead.
        Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}
2358 2359 FIOp.ChangeToImmediate(Offset); 2360 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 2361 Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 2362 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2363 .addImm(Offset); 2364 FIOp.ChangeToRegister(TmpReg, false, false, true); 2365 } 2366 } 2367 } 2368 } 2369 2370 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { 2371 return AMDGPUInstPrinter::getRegisterName(Reg); 2372 } 2373 2374 static const TargetRegisterClass * 2375 getAnyVGPRClassForBitWidth(unsigned BitWidth) { 2376 if (BitWidth <= 64) 2377 return &AMDGPU::VReg_64RegClass; 2378 if (BitWidth <= 96) 2379 return &AMDGPU::VReg_96RegClass; 2380 if (BitWidth <= 128) 2381 return &AMDGPU::VReg_128RegClass; 2382 if (BitWidth <= 160) 2383 return &AMDGPU::VReg_160RegClass; 2384 if (BitWidth <= 192) 2385 return &AMDGPU::VReg_192RegClass; 2386 if (BitWidth <= 224) 2387 return &AMDGPU::VReg_224RegClass; 2388 if (BitWidth <= 256) 2389 return &AMDGPU::VReg_256RegClass; 2390 if (BitWidth <= 512) 2391 return &AMDGPU::VReg_512RegClass; 2392 if (BitWidth <= 1024) 2393 return &AMDGPU::VReg_1024RegClass; 2394 2395 return nullptr; 2396 } 2397 2398 static const TargetRegisterClass * 2399 getAlignedVGPRClassForBitWidth(unsigned BitWidth) { 2400 if (BitWidth <= 64) 2401 return &AMDGPU::VReg_64_Align2RegClass; 2402 if (BitWidth <= 96) 2403 return &AMDGPU::VReg_96_Align2RegClass; 2404 if (BitWidth <= 128) 2405 return &AMDGPU::VReg_128_Align2RegClass; 2406 if (BitWidth <= 160) 2407 return &AMDGPU::VReg_160_Align2RegClass; 2408 if (BitWidth <= 192) 2409 return &AMDGPU::VReg_192_Align2RegClass; 2410 if (BitWidth <= 224) 2411 return &AMDGPU::VReg_224_Align2RegClass; 2412 if (BitWidth <= 256) 2413 return &AMDGPU::VReg_256_Align2RegClass; 2414 if (BitWidth <= 512) 2415 return &AMDGPU::VReg_512_Align2RegClass; 2416 if (BitWidth <= 1024) 2417 return &AMDGPU::VReg_1024_Align2RegClass; 2418 2419 return nullptr; 2420 } 2421 2422 const 
/// Returns the VGPR register class to use for a value of \p BitWidth bits.
///
/// Widths up to 32 map to VReg_1 (exactly 1 bit), VGPR_LO16 (up to 16 bits) or
/// VGPR_32. Wider values are delegated to the aligned or unaligned tuple
/// lookup depending on ST.needsAlignedVGPRs(); those return nullptr above
/// 1024 bits.
const TargetRegisterClass *
SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth > 32) {
    if (ST.needsAlignedVGPRs())
      return getAlignedVGPRClassForBitWidth(BitWidth);
    return getAnyVGPRClassForBitWidth(BitWidth);
  }

  if (BitWidth == 1)
    return &AMDGPU::VReg_1RegClass;

  return BitWidth <= 16 ? &AMDGPU::VGPR_LO16RegClass
                        : &AMDGPU::VGPR_32RegClass;
}

/// Returns the first AReg_* tuple class (no alignment requirement) whose size
/// is at least \p BitWidth, starting from the 64-bit class, or nullptr when
/// \p BitWidth exceeds 1024.
static const TargetRegisterClass *
getAnyAGPRClassForBitWidth(unsigned BitWidth) {
  struct WidthEntry {
    unsigned MaxBits;
    const TargetRegisterClass *RC;
  };
  // Sorted by ascending width; the first entry that fits wins.
  static const WidthEntry Classes[] = {
      {64, &AMDGPU::AReg_64RegClass},
      {96, &AMDGPU::AReg_96RegClass},
      {128, &AMDGPU::AReg_128RegClass},
      {160, &AMDGPU::AReg_160RegClass},
      {192, &AMDGPU::AReg_192RegClass},
      {224, &AMDGPU::AReg_224RegClass},
      {256, &AMDGPU::AReg_256RegClass},
      {512, &AMDGPU::AReg_512RegClass},
      {1024, &AMDGPU::AReg_1024RegClass},
  };

  for (const WidthEntry &Entry : Classes) {
    if (BitWidth <= Entry.MaxBits)
      return Entry.RC;
  }

  return nullptr;
}

/// Returns the first AReg_*_Align2 tuple class whose size is at least
/// \p BitWidth, starting from the 64-bit class, or nullptr when \p BitWidth
/// exceeds 1024.
static const TargetRegisterClass *
getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
  struct WidthEntry {
    unsigned MaxBits;
    const TargetRegisterClass *RC;
  };
  // Sorted by ascending width; the first entry that fits wins.
  static const WidthEntry Classes[] = {
      {64, &AMDGPU::AReg_64_Align2RegClass},
      {96, &AMDGPU::AReg_96_Align2RegClass},
      {128, &AMDGPU::AReg_128_Align2RegClass},
      {160, &AMDGPU::AReg_160_Align2RegClass},
      {192, &AMDGPU::AReg_192_Align2RegClass},
      {224, &AMDGPU::AReg_224_Align2RegClass},
      {256, &AMDGPU::AReg_256_Align2RegClass},
      {512, &AMDGPU::AReg_512_Align2RegClass},
      {1024, &AMDGPU::AReg_1024_Align2RegClass},
  };

  for (const WidthEntry &Entry : Classes) {
    if (BitWidth <= Entry.MaxBits)
      return Entry.RC;
  }

  return nullptr;
}
BitWidth) const { 2484 if (BitWidth <= 16) 2485 return &AMDGPU::AGPR_LO16RegClass; 2486 if (BitWidth <= 32) 2487 return &AMDGPU::AGPR_32RegClass; 2488 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth) 2489 : getAnyAGPRClassForBitWidth(BitWidth); 2490 } 2491 2492 static const TargetRegisterClass * 2493 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { 2494 if (BitWidth <= 64) 2495 return &AMDGPU::AV_64RegClass; 2496 if (BitWidth <= 96) 2497 return &AMDGPU::AV_96RegClass; 2498 if (BitWidth <= 128) 2499 return &AMDGPU::AV_128RegClass; 2500 if (BitWidth <= 160) 2501 return &AMDGPU::AV_160RegClass; 2502 if (BitWidth <= 192) 2503 return &AMDGPU::AV_192RegClass; 2504 if (BitWidth <= 224) 2505 return &AMDGPU::AV_224RegClass; 2506 if (BitWidth <= 256) 2507 return &AMDGPU::AV_256RegClass; 2508 if (BitWidth <= 512) 2509 return &AMDGPU::AV_512RegClass; 2510 if (BitWidth <= 1024) 2511 return &AMDGPU::AV_1024RegClass; 2512 2513 return nullptr; 2514 } 2515 2516 static const TargetRegisterClass * 2517 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { 2518 if (BitWidth <= 64) 2519 return &AMDGPU::AV_64_Align2RegClass; 2520 if (BitWidth <= 96) 2521 return &AMDGPU::AV_96_Align2RegClass; 2522 if (BitWidth <= 128) 2523 return &AMDGPU::AV_128_Align2RegClass; 2524 if (BitWidth <= 160) 2525 return &AMDGPU::AV_160_Align2RegClass; 2526 if (BitWidth <= 192) 2527 return &AMDGPU::AV_192_Align2RegClass; 2528 if (BitWidth <= 224) 2529 return &AMDGPU::AV_224_Align2RegClass; 2530 if (BitWidth <= 256) 2531 return &AMDGPU::AV_256_Align2RegClass; 2532 if (BitWidth <= 512) 2533 return &AMDGPU::AV_512_Align2RegClass; 2534 if (BitWidth <= 1024) 2535 return &AMDGPU::AV_1024_Align2RegClass; 2536 2537 return nullptr; 2538 } 2539 2540 const TargetRegisterClass * 2541 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { 2542 if (BitWidth <= 16) 2543 return &AMDGPU::VGPR_LO16RegClass; 2544 if (BitWidth <= 32) 2545 return &AMDGPU::AV_32RegClass; 2546 
return ST.needsAlignedVGPRs() 2547 ? getAlignedVectorSuperClassForBitWidth(BitWidth) 2548 : getAnyVectorSuperClassForBitWidth(BitWidth); 2549 } 2550 2551 const TargetRegisterClass * 2552 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { 2553 if (BitWidth <= 16) 2554 return &AMDGPU::SGPR_LO16RegClass; 2555 if (BitWidth <= 32) 2556 return &AMDGPU::SReg_32RegClass; 2557 if (BitWidth <= 64) 2558 return &AMDGPU::SReg_64RegClass; 2559 if (BitWidth <= 96) 2560 return &AMDGPU::SGPR_96RegClass; 2561 if (BitWidth <= 128) 2562 return &AMDGPU::SGPR_128RegClass; 2563 if (BitWidth <= 160) 2564 return &AMDGPU::SGPR_160RegClass; 2565 if (BitWidth <= 192) 2566 return &AMDGPU::SGPR_192RegClass; 2567 if (BitWidth <= 224) 2568 return &AMDGPU::SGPR_224RegClass; 2569 if (BitWidth <= 256) 2570 return &AMDGPU::SGPR_256RegClass; 2571 if (BitWidth <= 512) 2572 return &AMDGPU::SGPR_512RegClass; 2573 if (BitWidth <= 1024) 2574 return &AMDGPU::SGPR_1024RegClass; 2575 2576 return nullptr; 2577 } 2578 2579 // FIXME: This is very slow. It might be worth creating a map from physreg to 2580 // register class. 
2581 const TargetRegisterClass * 2582 SIRegisterInfo::getPhysRegClass(MCRegister Reg) const { 2583 static const TargetRegisterClass *const BaseClasses[] = { 2584 &AMDGPU::VGPR_LO16RegClass, 2585 &AMDGPU::VGPR_HI16RegClass, 2586 &AMDGPU::SReg_LO16RegClass, 2587 &AMDGPU::AGPR_LO16RegClass, 2588 &AMDGPU::VGPR_32RegClass, 2589 &AMDGPU::SReg_32RegClass, 2590 &AMDGPU::AGPR_32RegClass, 2591 &AMDGPU::AGPR_32RegClass, 2592 &AMDGPU::VReg_64_Align2RegClass, 2593 &AMDGPU::VReg_64RegClass, 2594 &AMDGPU::SReg_64RegClass, 2595 &AMDGPU::AReg_64_Align2RegClass, 2596 &AMDGPU::AReg_64RegClass, 2597 &AMDGPU::VReg_96_Align2RegClass, 2598 &AMDGPU::VReg_96RegClass, 2599 &AMDGPU::SReg_96RegClass, 2600 &AMDGPU::AReg_96_Align2RegClass, 2601 &AMDGPU::AReg_96RegClass, 2602 &AMDGPU::VReg_128_Align2RegClass, 2603 &AMDGPU::VReg_128RegClass, 2604 &AMDGPU::SReg_128RegClass, 2605 &AMDGPU::AReg_128_Align2RegClass, 2606 &AMDGPU::AReg_128RegClass, 2607 &AMDGPU::VReg_160_Align2RegClass, 2608 &AMDGPU::VReg_160RegClass, 2609 &AMDGPU::SReg_160RegClass, 2610 &AMDGPU::AReg_160_Align2RegClass, 2611 &AMDGPU::AReg_160RegClass, 2612 &AMDGPU::VReg_192_Align2RegClass, 2613 &AMDGPU::VReg_192RegClass, 2614 &AMDGPU::SReg_192RegClass, 2615 &AMDGPU::AReg_192_Align2RegClass, 2616 &AMDGPU::AReg_192RegClass, 2617 &AMDGPU::VReg_224_Align2RegClass, 2618 &AMDGPU::VReg_224RegClass, 2619 &AMDGPU::SReg_224RegClass, 2620 &AMDGPU::AReg_224_Align2RegClass, 2621 &AMDGPU::AReg_224RegClass, 2622 &AMDGPU::VReg_256_Align2RegClass, 2623 &AMDGPU::VReg_256RegClass, 2624 &AMDGPU::SReg_256RegClass, 2625 &AMDGPU::AReg_256_Align2RegClass, 2626 &AMDGPU::AReg_256RegClass, 2627 &AMDGPU::VReg_512_Align2RegClass, 2628 &AMDGPU::VReg_512RegClass, 2629 &AMDGPU::SReg_512RegClass, 2630 &AMDGPU::AReg_512_Align2RegClass, 2631 &AMDGPU::AReg_512RegClass, 2632 &AMDGPU::SReg_1024RegClass, 2633 &AMDGPU::VReg_1024_Align2RegClass, 2634 &AMDGPU::VReg_1024RegClass, 2635 &AMDGPU::AReg_1024_Align2RegClass, 2636 &AMDGPU::AReg_1024RegClass, 2637 
&AMDGPU::SCC_CLASSRegClass, 2638 &AMDGPU::Pseudo_SReg_32RegClass, 2639 &AMDGPU::Pseudo_SReg_128RegClass, 2640 }; 2641 2642 for (const TargetRegisterClass *BaseClass : BaseClasses) { 2643 if (BaseClass->contains(Reg)) { 2644 return BaseClass; 2645 } 2646 } 2647 return nullptr; 2648 } 2649 2650 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, 2651 Register Reg) const { 2652 const TargetRegisterClass *RC; 2653 if (Reg.isVirtual()) 2654 RC = MRI.getRegClass(Reg); 2655 else 2656 RC = getPhysRegClass(Reg); 2657 return isSGPRClass(RC); 2658 } 2659 2660 const TargetRegisterClass * 2661 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { 2662 unsigned Size = getRegSizeInBits(*SRC); 2663 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 2664 assert(VRC && "Invalid register class size"); 2665 return VRC; 2666 } 2667 2668 const TargetRegisterClass * 2669 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { 2670 unsigned Size = getRegSizeInBits(*SRC); 2671 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 2672 assert(ARC && "Invalid register class size"); 2673 return ARC; 2674 } 2675 2676 const TargetRegisterClass * 2677 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { 2678 unsigned Size = getRegSizeInBits(*VRC); 2679 if (Size == 32) 2680 return &AMDGPU::SGPR_32RegClass; 2681 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); 2682 assert(SRC && "Invalid register class size"); 2683 return SRC; 2684 } 2685 2686 const TargetRegisterClass *SIRegisterInfo::getSubRegClass( 2687 const TargetRegisterClass *RC, unsigned SubIdx) const { 2688 if (SubIdx == AMDGPU::NoSubRegister) 2689 return RC; 2690 2691 // We can assume that each lane corresponds to one 32-bit register. 
2692 unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32; 2693 if (isAGPRClass(RC)) { 2694 RC = getAGPRClassForBitWidth(Size); 2695 } else if (isVGPRClass(RC)) { 2696 RC = getVGPRClassForBitWidth(Size); 2697 } else if (isVectorSuperClass(RC)) { 2698 RC = getVectorSuperClassForBitWidth(Size); 2699 } else { 2700 RC = getSGPRClassForBitWidth(Size); 2701 } 2702 assert(RC && "Invalid sub-register class size"); 2703 return RC; 2704 } 2705 2706 const TargetRegisterClass * 2707 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, 2708 const TargetRegisterClass *SubRC, 2709 unsigned SubIdx) const { 2710 // Ensure this subregister index is aligned in the super register. 2711 const TargetRegisterClass *MatchRC = 2712 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); 2713 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr; 2714 } 2715 2716 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { 2717 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && 2718 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) 2719 return !ST.hasMFMAInlineLiteralBug(); 2720 2721 return OpType >= AMDGPU::OPERAND_SRC_FIRST && 2722 OpType <= AMDGPU::OPERAND_SRC_LAST; 2723 } 2724 2725 bool SIRegisterInfo::shouldRewriteCopySrc( 2726 const TargetRegisterClass *DefRC, 2727 unsigned DefSubReg, 2728 const TargetRegisterClass *SrcRC, 2729 unsigned SrcSubReg) const { 2730 // We want to prefer the smallest register class possible, so we don't want to 2731 // stop and rewrite on anything that looks like a subregister 2732 // extract. Operations mostly don't care about the super register class, so we 2733 // only want to stop on the most basic of copies between the same register 2734 // class. 2735 // 2736 // e.g. if we have something like 2737 // %0 = ... 2738 // %1 = ... 2739 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 2740 // %3 = COPY %2, sub0 2741 // 2742 // We want to look through the COPY to find: 2743 // => %3 = COPY %0 2744 2745 // Plain copy. 
2746 return getCommonSubClass(DefRC, SrcRC) != nullptr; 2747 } 2748 2749 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { 2750 // TODO: 64-bit operands have extending behavior from 32-bit literal. 2751 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && 2752 OpType <= AMDGPU::OPERAND_REG_IMM_LAST; 2753 } 2754 2755 /// Returns a lowest register that is not used at any point in the function. 2756 /// If all registers are used, then this function will return 2757 /// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return 2758 /// highest unused register. 2759 MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, 2760 const TargetRegisterClass *RC, 2761 const MachineFunction &MF, 2762 bool ReserveHighestVGPR) const { 2763 if (ReserveHighestVGPR) { 2764 for (MCRegister Reg : reverse(*RC)) 2765 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2766 return Reg; 2767 } else { 2768 for (MCRegister Reg : *RC) 2769 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2770 return Reg; 2771 } 2772 return MCRegister(); 2773 } 2774 2775 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, 2776 unsigned EltSize) const { 2777 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC); 2778 assert(RegBitWidth >= 32 && RegBitWidth <= 1024); 2779 2780 const unsigned RegDWORDs = RegBitWidth / 32; 2781 const unsigned EltDWORDs = EltSize / 4; 2782 assert(RegSplitParts.size() + 1 >= EltDWORDs); 2783 2784 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1]; 2785 const unsigned NumParts = RegDWORDs / EltDWORDs; 2786 2787 return makeArrayRef(Parts.data(), NumParts); 2788 } 2789 2790 const TargetRegisterClass* 2791 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, 2792 Register Reg) const { 2793 return Reg.isVirtual() ? 
MRI.getRegClass(Reg) : getPhysRegClass(Reg); 2794 } 2795 2796 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, 2797 Register Reg) const { 2798 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 2799 // Registers without classes are unaddressable, SGPR-like registers. 2800 return RC && isVGPRClass(RC); 2801 } 2802 2803 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, 2804 Register Reg) const { 2805 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 2806 2807 // Registers without classes are unaddressable, SGPR-like registers. 2808 return RC && isAGPRClass(RC); 2809 } 2810 2811 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, 2812 const TargetRegisterClass *SrcRC, 2813 unsigned SubReg, 2814 const TargetRegisterClass *DstRC, 2815 unsigned DstSubReg, 2816 const TargetRegisterClass *NewRC, 2817 LiveIntervals &LIS) const { 2818 unsigned SrcSize = getRegSizeInBits(*SrcRC); 2819 unsigned DstSize = getRegSizeInBits(*DstRC); 2820 unsigned NewSize = getRegSizeInBits(*NewRC); 2821 2822 // Do not increase size of registers beyond dword, we would need to allocate 2823 // adjacent registers and constraint regalloc more than needed. 2824 2825 // Always allow dword coalescing. 
2826 if (SrcSize <= 32 || DstSize <= 32) 2827 return true; 2828 2829 return NewSize <= DstSize || NewSize <= SrcSize; 2830 } 2831 2832 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 2833 MachineFunction &MF) const { 2834 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2835 2836 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), 2837 MF.getFunction()); 2838 switch (RC->getID()) { 2839 default: 2840 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); 2841 case AMDGPU::VGPR_32RegClassID: 2842 case AMDGPU::VGPR_LO16RegClassID: 2843 case AMDGPU::VGPR_HI16RegClassID: 2844 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); 2845 case AMDGPU::SGPR_32RegClassID: 2846 case AMDGPU::SGPR_LO16RegClassID: 2847 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); 2848 } 2849 } 2850 2851 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 2852 unsigned Idx) const { 2853 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || 2854 Idx == AMDGPU::RegisterPressureSets::AGPR_32) 2855 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 2856 const_cast<MachineFunction &>(MF)); 2857 2858 if (Idx == AMDGPU::RegisterPressureSets::SReg_32) 2859 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 2860 const_cast<MachineFunction &>(MF)); 2861 2862 llvm_unreachable("Unexpected register pressure set!"); 2863 } 2864 2865 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 2866 static const int Empty[] = { -1 }; 2867 2868 if (RegPressureIgnoredUnits[RegUnit]) 2869 return Empty; 2870 2871 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); 2872 } 2873 2874 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 2875 // Not a callee saved register. 
2876 return AMDGPU::SGPR30_SGPR31; 2877 } 2878 2879 const TargetRegisterClass * 2880 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 2881 const RegisterBank &RB, 2882 const MachineRegisterInfo &MRI) const { 2883 switch (RB.getID()) { 2884 case AMDGPU::VGPRRegBankID: 2885 return getVGPRClassForBitWidth(std::max(32u, Size)); 2886 case AMDGPU::VCCRegBankID: 2887 assert(Size == 1); 2888 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 2889 : &AMDGPU::SReg_64_XEXECRegClass; 2890 case AMDGPU::SGPRRegBankID: 2891 return getSGPRClassForBitWidth(std::max(32u, Size)); 2892 case AMDGPU::AGPRRegBankID: 2893 return getAGPRClassForBitWidth(std::max(32u, Size)); 2894 default: 2895 llvm_unreachable("unknown register bank"); 2896 } 2897 } 2898 2899 const TargetRegisterClass * 2900 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 2901 const MachineRegisterInfo &MRI) const { 2902 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 2903 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) 2904 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); 2905 2906 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>()) 2907 return getAllocatableClass(RC); 2908 2909 return nullptr; 2910 } 2911 2912 MCRegister SIRegisterInfo::getVCC() const { 2913 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 2914 } 2915 2916 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { 2917 // VGPR tuples have an alignment requirement on gfx90a variants. 2918 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass 2919 : &AMDGPU::VReg_64RegClass; 2920 } 2921 2922 const TargetRegisterClass * 2923 SIRegisterInfo::getRegClass(unsigned RCID) const { 2924 switch ((int)RCID) { 2925 case AMDGPU::SReg_1RegClassID: 2926 return getBoolRC(); 2927 case AMDGPU::SReg_1_XEXECRegClassID: 2928 return isWave32 ? 
&AMDGPU::SReg_32_XM0_XEXECRegClass 2929 : &AMDGPU::SReg_64_XEXECRegClass; 2930 case -1: 2931 return nullptr; 2932 default: 2933 return AMDGPUGenRegisterInfo::getRegClass(RCID); 2934 } 2935 } 2936 2937 // Find reaching register definition 2938 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 2939 MachineInstr &Use, 2940 MachineRegisterInfo &MRI, 2941 LiveIntervals *LIS) const { 2942 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 2943 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 2944 SlotIndex DefIdx; 2945 2946 if (Reg.isVirtual()) { 2947 if (!LIS->hasInterval(Reg)) 2948 return nullptr; 2949 LiveInterval &LI = LIS->getInterval(Reg); 2950 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 2951 : MRI.getMaxLaneMaskForVReg(Reg); 2952 VNInfo *V = nullptr; 2953 if (LI.hasSubRanges()) { 2954 for (auto &S : LI.subranges()) { 2955 if ((S.LaneMask & SubLanes) == SubLanes) { 2956 V = S.getVNInfoAt(UseIdx); 2957 break; 2958 } 2959 } 2960 } else { 2961 V = LI.getVNInfoAt(UseIdx); 2962 } 2963 if (!V) 2964 return nullptr; 2965 DefIdx = V->def; 2966 } else { 2967 // Find last def. 
2968 for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid(); 2969 ++Units) { 2970 LiveRange &LR = LIS->getRegUnit(*Units); 2971 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 2972 if (!DefIdx.isValid() || 2973 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 2974 LIS->getInstructionFromIndex(V->def))) 2975 DefIdx = V->def; 2976 } else { 2977 return nullptr; 2978 } 2979 } 2980 } 2981 2982 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 2983 2984 if (!Def || !MDT.dominates(Def, &Use)) 2985 return nullptr; 2986 2987 assert(Def->modifiesRegister(Reg, this)); 2988 2989 return Def; 2990 } 2991 2992 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 2993 assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32); 2994 2995 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 2996 AMDGPU::SReg_32RegClass, 2997 AMDGPU::AGPR_32RegClass } ) { 2998 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 2999 return Super; 3000 } 3001 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 3002 &AMDGPU::VGPR_32RegClass)) { 3003 return Super; 3004 } 3005 3006 return AMDGPU::NoRegister; 3007 } 3008 3009 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { 3010 if (!ST.needsAlignedVGPRs()) 3011 return true; 3012 3013 if (isVGPRClass(&RC)) 3014 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); 3015 if (isAGPRClass(&RC)) 3016 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); 3017 if (isVectorSuperClass(&RC)) 3018 return RC.hasSuperClassEq( 3019 getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); 3020 3021 return true; 3022 } 3023 3024 const TargetRegisterClass * 3025 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { 3026 if (!RC || !ST.needsAlignedVGPRs()) 3027 return RC; 3028 3029 unsigned Size = getRegSizeInBits(*RC); 3030 if (Size <= 32) 3031 return RC; 3032 3033 if (isVGPRClass(RC)) 3034 return 
getAlignedVGPRClassForBitWidth(Size); 3035 if (isAGPRClass(RC)) 3036 return getAlignedAGPRClassForBitWidth(Size); 3037 if (isVectorSuperClass(RC)) 3038 return getAlignedVectorSuperClassForBitWidth(Size); 3039 3040 return RC; 3041 } 3042 3043 bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { 3044 switch (PhysReg) { 3045 case AMDGPU::SGPR_NULL: 3046 case AMDGPU::SRC_SHARED_BASE: 3047 case AMDGPU::SRC_PRIVATE_BASE: 3048 case AMDGPU::SRC_SHARED_LIMIT: 3049 case AMDGPU::SRC_PRIVATE_LIMIT: 3050 return true; 3051 default: 3052 return false; 3053 } 3054 } 3055 3056 ArrayRef<MCPhysReg> 3057 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { 3058 return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), 3059 ST.getMaxNumSGPRs(MF) / 4); 3060 } 3061 3062 ArrayRef<MCPhysReg> 3063 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 3064 return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(), 3065 ST.getMaxNumSGPRs(MF) / 2); 3066 } 3067 3068 ArrayRef<MCPhysReg> 3069 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 3070 return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 3071 } 3072