//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "SIRegisterInfo.h"
#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"

static cl::opt<bool> EnableSpillSGPRToVGPR(
    "amdgpu-spill-sgpr-to-vgpr",
    cl::desc("Enable spilling SGPRs to VGPRs"),
    cl::ReallyHidden,
    cl::init(true));

std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted by 1, so that a mapping of 0 means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};

namespace llvm {

// A temporary struct to spill SGPRs.
// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
// just v_writelane and v_readlane.
//
// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
// is saved to scratch (or the other way around for loads).
// For this, a VGPR is required where the needed lanes can be clobbered. The
// RegScavenger can provide a VGPR where currently active lanes can be
// clobbered, but we still need to save inactive lanes.
// The high-level steps are:
// - Try to scavenge SGPR(s) to save exec
// - Try to scavenge VGPR
// - Save needed, all or inactive lanes of a TmpVGPR
// - Spill/Restore SGPRs using TmpVGPR
// - Restore TmpVGPR
//
// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
// cannot scavenge temporary SGPRs to save exec, we use the following code:
//   buffer_store_dword TmpVGPR ; only if active lanes need to be saved
//   s_not exec, exec
//   buffer_store_dword TmpVGPR ; save inactive lanes
//   s_not exec, exec
struct SGPRSpillBuilder {
  struct PerVGPRData {
    unsigned PerVGPR;
    unsigned NumVGPRs;
    int64_t VGPRLanes;
  };

  // The SGPR to save
  Register SuperReg;
  MachineBasicBlock::iterator MI;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  bool IsKill;
  const DebugLoc &DL;

  /* When spilling to stack */
  // The SGPRs are written into this VGPR, which is then written to scratch
  // (or vice versa for loads).
  Register TmpVGPR = AMDGPU::NoRegister;
  // Temporary spill slot to save TmpVGPR to.
  int TmpVGPRIndex = 0;
  // Whether TmpVGPR is live before the spill; false if it was scavenged (i.e.
  // it is dead in the currently active lanes), true if we fell back to v0.
  bool TmpVGPRLive = false;
  // Scavenged SGPR to save EXEC.
  Register SavedExecReg = AMDGPU::NoRegister;
  // Stack index to write the SGPRs to.
  int Index;
  unsigned EltSize = 4;

  RegScavenger *RS;
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  bool IsWave32;
  Register ExecReg;
  unsigned MovOpc;
  unsigned NotOpc;

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
                   RegScavenger *RS)
      : SuperReg(MI->getOperand(0).getReg()), MI(MI),
        IsKill(MI->getOperand(0).isKill()), DL(MI->getDebugLoc()), Index(Index),
        RS(RS), MBB(*MI->getParent()), MF(*MBB.getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        IsWave32(IsWave32) {
    const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    if (IsWave32) {
      ExecReg = AMDGPU::EXEC_LO;
      MovOpc = AMDGPU::S_MOV_B32;
      NotOpc = AMDGPU::S_NOT_B32;
    } else {
      ExecReg = AMDGPU::EXEC;
      MovOpc = AMDGPU::S_MOV_B64;
      NotOpc = AMDGPU::S_NOT_B64;
    }

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
  }

  PerVGPRData getPerVGPRData() {
    PerVGPRData Data;
    Data.PerVGPR = IsWave32 ? 32 : 64;
    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    return Data;
  }
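
  // Illustrative example (registers chosen for illustration only): spilling a
  // 4-dword tuple such as s[4:7] in wave64 gives PerVGPR = 64, NumVGPRs = 1
  // and VGPRLanes = 0xf, i.e. only lanes 0-3 of the temporary VGPR are needed.
  // A 1024-bit tuple in wave32 still fits in one VGPR, with all 32 lanes used
  // (VGPRLanes = 0xffffffff).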

  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
  // free.
  // Writes these instructions if an SGPR can be scavenged:
  //   s_mov_b64 s[6:7], exec   ; Save exec
  //   s_mov_b64 exec, 3        ; Wanted lanemask
  //   buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
  //
  // Writes these instructions if no SGPR can be scavenged:
  //   buffer_store_dword v0    ; Only if no free VGPR was found
  //   s_not_b64 exec, exec
  //   buffer_store_dword v0    ; Save inactive lanes
  //                            ; exec stays inverted, it is flipped back in
  //                            ; restore.
  void prepare() {
    // Scavenged temporary VGPR to use. It must be scavenged once for any
    // number of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false);

    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    if (TmpVGPR) {
      // Found a register that is dead in the currently active lanes, we only
      // need to spill inactive lanes.
      TmpVGPRLive = false;
    } else {
      // Pick v0 because it doesn't make a difference.
      TmpVGPR = AMDGPU::VGPR0;
      TmpVGPRLive = true;
    }

    // Try to scavenge SGPRs to save exec
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false);

    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
  }
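
  // Note: the instruction sequences shown in the comments here and in
  // restore() below are for wave64; in wave32 the constructor selects
  // s_mov_b32/s_not_b32 on exec_lo instead (see MovOpc/NotOpc/ExecReg).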

  // Writes these instructions if an SGPR can be scavenged:
  //   buffer_load_dword v1     ; Reload scavenged VGPR from emergency slot
  //   s_waitcnt vmcnt(0)       ; Wait for the reload before touching exec
  //   s_mov_b64 exec, s[6:7]   ; Restore exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  //   buffer_load_dword v0     ; Restore inactive lanes
  //   s_waitcnt vmcnt(0)       ; Wait for the reload before touching exec
  //   s_not_b64 exec, exec
  //   buffer_load_dword v0     ; Only if no free VGPR was found
  void restore() {
    if (SavedExecReg) {
      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
    } else {
      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
      // Restore active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    }
  }
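
  // Note: in the path where no SGPR could be scavenged, prepare() leaves exec
  // inverted until restore() flips it back. v_writelane/v_readlane address
  // lanes explicitly and are unaffected by exec, and readWriteTmpVGPR() below
  // toggles exec itself so that both lane halves are covered.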

  // Write TmpVGPR to memory or read TmpVGPR from memory.
  // Either using a single buffer_load/store if exec is set to the needed mask
  // or using
  //   buffer_load
  //   s_not exec, exec
  //   buffer_load
  //   s_not exec, exec
  void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
    if (SavedExecReg) {
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
    } else {
      // Spill active lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
                                  /*IsKill*/ false);
      // Spill inactive lanes
      BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
      BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
    }
  }
};

} // namespace llvm

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
      SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

  assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
         getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
         (getSubRegIndexLaneMask(AMDGPU::lo16) |
          getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
             getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
         "getNumCoveredRegs() will not work with generated subreg masks!");

  RegPressureIgnoredUnits.resize(getNumRegUnits());
  RegPressureIgnoredUnits.set(
      *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this));
  for (auto Reg : AMDGPU::VGPR_HI16RegClass)
    RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegSplitPartsFlag;

  static auto InitializeRegSplitPartsOnce = [this]() {
    for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
      unsigned Size = getSubRegIdxSize(Idx);
      if (Size & 31)
        continue;
      std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
      unsigned Pos = getSubRegIdxOffset(Idx);
      if (Pos % Size)
        continue;
      Pos /= Size;
      if (Vec.empty()) {
        unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
        Vec.resize(MaxNumParts);
      }
      Vec[Pos] = Idx;
    }
  };
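
  // Illustrative example: the 64-bit subreg index sub2_sub3 (size 64, offset
  // 64) lands in RegSplitParts[64 / 32 - 1][64 / 64], i.e. row 1, column 1.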

  static llvm::once_flag InitializeSubRegFromChannelTableFlag;

  static auto InitializeSubRegFromChannelTableOnce = [this]() {
    for (auto &Row : SubRegFromChannelTable)
      Row.fill(AMDGPU::NoSubRegister);
    for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
      unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
      unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
      assert(Width < SubRegFromChannelTableWidthMap.size());
      Width = SubRegFromChannelTableWidthMap[Width];
      if (Width == 0)
        continue;
      unsigned TableIdx = Width - 1;
      assert(TableIdx < SubRegFromChannelTable.size());
      assert(Offset < SubRegFromChannelTable[TableIdx].size());
      SubRegFromChannelTable[TableIdx][Offset] = Idx;
    }
  };

  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
  llvm::call_once(InitializeSubRegFromChannelTableFlag,
                  InitializeSubRegFromChannelTableOnce);
}

void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
                                           MCRegister Reg) const {
  MCRegAliasIterator R(Reg, this, true);

  for (; R.isValid(); ++R)
    Reserved.set(*R);
}

// Forced to be here by one generated .inc file.
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
    const MachineFunction *MF) const {
  CallingConv::ID CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
  case CallingConv::AMDGPU_Gfx:
    return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
               ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
               : CSR_AMDGPU_HighRegs_SaveList;
  default: {
    // Dummy to not crash RegisterClassInfo.
    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
    return &NoCalleeSavedReg;
  }
  }
}

const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
  return nullptr;
}

const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                     CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
  case CallingConv::AMDGPU_Gfx:
    return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
               ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
               : CSR_AMDGPU_HighRegs_RegMask;
  default:
    return nullptr;
  }
}

const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
  return CSR_AMDGPU_NoRegs_RegMask;
}

Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  const SIFrameLowering *TFI =
      MF.getSubtarget<GCNSubtarget>().getFrameLowering();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // During ISel lowering we always reserve the stack pointer in entry
  // functions, but never actually want to reference it when accessing our own
  // frame. If we need a frame pointer we use it, but otherwise we can just use
  // an immediate "0" which we represent by returning NoRegister.
  if (FuncInfo->isEntryFunction()) {
    return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
  }
  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                        : FuncInfo->getStackPtrOffsetReg();
}
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
  // When we need stack realignment, we can't reference off of the
  // stack pointer, so we reserve a base pointer.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.getNumFixedObjects() && shouldRealignStack(MF);
}

Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
  return CSR_AMDGPU_AllVGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
  return CSR_AMDGPU_AllAGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
  return CSR_AMDGPU_AllVectorRegs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
  return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}

unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
                                              unsigned NumRegs) {
  assert(NumRegs < SubRegFromChannelTableWidthMap.size());
  unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
  assert(NumRegIndex && "Not implemented");
  assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
  return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}
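
// Illustrative example: getSubRegFromChannel(2, 2) returns the subreg index
// covering dwords 2-3 of a tuple, i.e. sub2_sub3: NumRegs = 2 maps to 2 via
// SubRegFromChannelTableWidthMap, selecting row 1 of SubRegFromChannelTable,
// and Channel = 2 picks the entry whose range starts at dword 2.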
MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
    const MachineFunction &MF) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::MODE);

  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);

  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
  // will result in bugs.
  if (isWave32) {
    Reserved.set(AMDGPU::VCC);
    Reserved.set(AMDGPU::VCC_HI);
  }

  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  // TODO: In an entry function without calls and AGPRs used it is possible
  //       to use the whole register budget for VGPRs. Even more it shall
  //       be possible to estimate maximum AGPR/VGPR pressure and split
  //       register file accordingly.
  if (ST.hasGFX90AInsts())
    MaxNumVGPRs /= 2;
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
    Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  for (auto Reg : AMDGPU::SReg_32RegClass) {
    Reserved.set(getSubReg(Reg, AMDGPU::hi16));
    Register Low = getSubReg(Reg, AMDGPU::lo16);
    // This is to prevent BB vcc liveness errors.
    if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
      Reserved.set(Low);
  }

  for (auto Reg : AMDGPU::AGPR_32RegClass) {
    Reserved.set(getSubReg(Reg, AMDGPU::hi16));
  }

  // Reserve all remaining AGPRs if there are no instructions to use them.
  if (!ST.hasMAIInsts()) {
    for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
      unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
  }

  // We have to assume the SP is needed in case there are calls in the function,
  // which is detected after the function is lowered. If we aren't really going
  // to need SP, don't bother reserving it.
  MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();

  if (StackPtrReg) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  MCRegister FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  if (hasBasePointer(MF)) {
    MCRegister BasePtrReg = getBaseRegister();
    reserveRegisterTuples(Reserved, BasePtrReg);
    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
  }

  for (auto Reg : MFI->WWMReservedRegs) {
    reserveRegisterTuples(Reserved, Reg.first);
  }

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (auto SSpill : MFI->getSGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, SSpill.VGPR);

  return Reserved;
}

bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry, the base address is 0, so it can't possibly need any more
  // alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isEntryFunction())
    return false;

  return TargetRegisterInfo::shouldRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
    const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
  // spill.
  return false;
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}
int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return 0;

  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                             AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");

  return getScratchInstrOffset(MI);
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t FullOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                 SIInstrFlags::FlatScratch);
}

Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                      int FrameIdx,
                                                      int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
                                           : AMDGPU::V_MOV_B32_e32;

  Register BaseReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                             : &AMDGPU::VGPR_32RegClass);

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
        .addFrameIndex(FrameIdx);
    return BaseReg;
  }

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
                             : &AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
      .addFrameIndex(FrameIdx);

  if (ST.enableFlatScratch()) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
        .addReg(OffsetReg, RegState::Kill)
        .addReg(FIReg);
    return BaseReg;
  }

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(FIReg)
      .addImm(0); // clamp bit

  return BaseReg;
}
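
// Illustrative sketch of the MUBUF path above (register names chosen for
// illustration only):
//   s_mov_b32 s6, <Offset>
//   v_mov_b32_e32 v5, <FrameIdx>
//   v_add_u32_e32 v4, s6, v5   ; or a carry-out add with a clamp operand on
//                              ; targets without a no-carry VALU add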
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                  SIInstrFlags::FlatScratch) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif

  assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
         "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}

bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        Register BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t NewOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                SIInstrFlags::FlatScratch);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}
static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}
static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg))
                     ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                     : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
                 .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
}
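
// Illustrative example: storing v1 into a reserved AGPR spill lane a5 emits
// "v_accvgpr_write_b32 a5, v1", and the corresponding reload emits
// "v_accvgpr_read_b32 v1, a5". When Reg is itself a VGPR (an AGPR value being
// spilled to a VGPR), the xor in the opcode selection above inverts the
// direction accordingly.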
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ? getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // cpol
          .addImm(0) // tfe
          .addImm(0) // swz
          .cloneMemRefs(*MI);

  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
                                          unsigned LoadStoreOp,
                                          unsigned EltSize) {
  bool IsStore = TII->get(LoadStoreOp).mayStore();
  bool UseST =
      AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
      AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0;

  switch (EltSize) {
  case 4:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    break;
  case 8:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
    break;
  case 12:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
    break;
  case 16:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
    break;
  default:
    llvm_unreachable("Unexpected spill load/store size!");
  }

  if (UseST)
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);

  return LoadStoreOp;
}
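
// Illustrative example: a 16-byte store spill selects
// SCRATCH_STORE_DWORDX4_SADDR, and is further converted to the ST variant
// (no address registers at all) when the instruction carries neither a vaddr
// nor a saddr operand.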
void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
    RegScavenger *RS, LivePhysRegs *LiveRegs) const {
  assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both");

  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  const DebugLoc &DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC);
  const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
  int64_t ScratchOffsetRegDelta = 0;

  if (IsFlat && EltSize > 4) {
    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    Desc = &TII->get(LoadStoreOp);
  }

  Align Alignment = MFI.getObjectAlign(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  assert((IsFlat || ((Offset % EltSize) == 0)) &&
         "unexpected VGPR spill offset");

  bool IsOffsetLegal =
      IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                      SIInstrFlags::FlatScratch)
             : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
  if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
    SOffset = MCRegister();

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
    if (!IsFlat)
      Offset *= ST.getWavefrontSize();

    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this
    // case.
    if (RS) {
      SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
    } else if (LiveRegs) {
      for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
        if (LiveRegs->available(MF->getRegInfo(), Reg)) {
          SOffset = Reg;
          break;
        }
      }
    }

    if (!SOffset) {
      // There are no free SGPRs, and we are already in the process of spilling
      // VGPRs. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and true on VI until spilling using scalar stores is
      // implemented), we have no way to free up an SGPR. Our solution here is
      // to add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.
      if (!ScratchOffsetReg)
        ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    if (!SOffset)
      report_fatal_error("could not scavenge SGPR to spill in entry function");

    if (ScratchOffsetReg == AMDGPU::NoRegister) {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
    } else {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
          .addReg(ScratchOffsetReg)
          .addImm(Offset);
    }

    Offset = 0;
  }

  if (IsFlat && SOffset == AMDGPU::NoRegister) {
    assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
           "Unexpected vaddr for flat scratch with a FI operand");

    assert(ST.hasFlatScratchSTMode());
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
    Desc = &TII->get(LoadStoreOp);
  }
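
  // From here on, the value is split into EltSize pieces: each piece is
  // either copied directly to/from a reserved AGPR lane (spillVGPRtoAGPR) or
  // loaded/stored with the scratch opcode selected above, with TmpReg acting
  // as a VGPR shuttle when the value itself lives in AGPRs.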
  Register TmpReg;

  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
       ++i, RegOffset += EltSize) {
    if (i == NumSubRegs) {
      EltSize = RemSize;
      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    }
    Desc = &TII->get(LoadStoreOp);

    unsigned NumRegs = EltSize / 4;
    Register SubReg = e == 1
                          ? ValueReg
                          : Register(getSubReg(
                                ValueReg,
                                getSubRegFromChannel(RegOffset / 4, NumRegs)));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    if (i + 1 == e) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
    bool NeedSuperRegDef = e > 1 && IsStore && i == 0;
    bool NeedSuperRegImpOperand = e > 1;

    unsigned Lane = RegOffset / 4;
    unsigned LaneE = (RegOffset + EltSize) / 4;
    for (; Lane != LaneE; ++Lane) {
      bool IsSubReg = e > 1 || EltSize > 4;
      Register Sub = IsSubReg
                         ? Register(getSubReg(ValueReg,
                                              getSubRegFromChannel(Lane)))
                         : ValueReg;
      auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
      if (!MIB.getInstr())
        break;
      if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
        MIB.addReg(ValueReg, RegState::ImplicitDefine);
        NeedSuperRegDef = false;
      }
      if (IsSubReg || NeedSuperRegImpOperand) {
        NeedSuperRegImpOperand = true;
        unsigned State = SrcDstRegState;
        if (Lane + 1 != LaneE)
          State &= ~RegState::Kill;
        MIB.addReg(ValueReg, RegState::Implicit | State);
      }
    }

    if (Lane == LaneE) // Fully spilled into AGPRs.
      continue;

    // Offset in bytes from the beginning of the ValueReg to its portion we
    // still need to spill. It may differ from RegOffset if a portion of
    // current SubReg has been already spilled into AGPRs by the loop above.
    unsigned RemRegOffset = Lane * 4;
    unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset);
    if (RemEltSize != EltSize) { // Partially spilled to AGPRs
      assert(IsFlat && EltSize > 4);

      unsigned NumRegs = RemEltSize / 4;
      SubReg = Register(getSubReg(
          ValueReg, getSubRegFromChannel(RemRegOffset / 4, NumRegs)));
      unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
      Desc = &TII->get(Opc);
    }

    unsigned FinalReg = SubReg;

    if (IsAGPR) {
      assert(EltSize == 4);

      if (!TmpReg) {
        assert(RS && "Needs to have RegScavenger to spill an AGPR!");
        // FIXME: change to scavengeRegisterBackwards()
        TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
        RS->setRegUsed(TmpReg);
      }
      if (IsStore) {
        auto AccRead = BuildMI(MBB, MI, DL,
                               TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
                           .addReg(SubReg, getKillRegState(IsKill));
        if (NeedSuperRegDef)
          AccRead.addReg(ValueReg, RegState::ImplicitDefine);
        AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
      }
      SubReg = TmpReg;
    }
    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RemRegOffset);
    MachineMemOperand *NewMMO =
        MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
                                 commonAlignment(Alignment, RemRegOffset));

    auto MIB =
        BuildMI(MBB, MI, DL, *Desc)
            .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
    if (!IsFlat)
      MIB.addReg(FuncInfo->getScratchRSrcReg());

    if (SOffset == AMDGPU::NoRegister) {
      if (!IsFlat)
        MIB.addImm(0);
    } else {
      MIB.addReg(SOffset, SOffsetRegState);
    }
    MIB.addImm(Offset + RemRegOffset)
        .addImm(0); // cpol
    if (!IsFlat)
      MIB.addImm(0)   // tfe
          .addImm(0); // swz
    MIB.addMemOperand(NewMMO);

    if (!IsAGPR && NeedSuperRegDef)
      MIB.addReg(ValueReg, RegState::ImplicitDefine);

    if (!IsStore && TmpReg != AMDGPU::NoRegister) {
      MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
                    FinalReg)
                .addReg(TmpReg, RegState::Kill);
      MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    }

    if (NeedSuperRegImpOperand)
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
  }

  if (ScratchOffsetRegDelta != 0) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
        .addReg(SOffset)
        .addImm(-ScratchOffsetRegDelta);
  }
}

void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
                                             int Offset, bool IsLoad,
                                             bool IsKill) const {
  // Load/store VGPR
  MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
  assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);

  Register FrameReg =
      FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
          ? getBaseRegister()
          : getFrameRegister(SB.MF);

  Align Alignment = FrameInfo.getObjectAlign(Index);
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
  MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
      PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
      SB.EltSize, Alignment);

  if (IsLoad) {
    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                          : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
    buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, false, FrameReg,
                        Offset * SB.EltSize, MMO, SB.RS);
  } else {
    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
    buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, IsKill, FrameReg,
                        Offset * SB.EltSize, MMO, SB.RS);
    // This only ever adds one VGPR spill
    SB.MFI.addToSpilledVGPRs(1);
  }
}
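
// Spill an SGPR (tuple) into lanes of an assigned spill VGPR if one exists,
// or to scratch memory via SGPRSpillBuilder otherwise. Illustrative example
// of the VGPR-lane path (register names chosen for illustration only):
// spilling s[8:9] to lanes 4-5 of spill VGPR v40 emits
//   v_writelane_b32 v40, s8, 4
//   v_writelane_b32 v40, s9, 5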
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS,
                               bool OnlyToVGPR) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
      SB.MFI.getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
                         SB.SuperReg != SB.MFI.getFrameOffsetReg()));

  if (SpillToVGPR) {
    for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];

      bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1;

      // Mark the "old value of vgpr" input undef only if this is the first
      // sgpr spill to this specific vgpr in the first basic block.
      auto MIB = BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
                         Spill.VGPR)
                     .addReg(SubReg, getKillRegState(UseKill))
                     .addImm(Spill.Lane)
                     .addReg(Spill.VGPR);

      if (i == 0 && SB.NumSubRegs > 1) {
        // We may be spilling a super-register which is only partially defined,
        // and need to ensure later spills think the value is defined.
        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
      }

      if (SB.NumSubRegs > 1)
        MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    }
  } else {
    SB.prepare();

    // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
    unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);

    // Per VGPR helper data
    auto PVD = SB.getPerVGPRData();

    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
      unsigned TmpVGPRFlags = RegState::Undef;

      // Write sub registers into the VGPR
      for (unsigned i = Offset * PVD.PerVGPR,
                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            SB.NumSubRegs == 1
                ? SB.SuperReg
                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

        MachineInstrBuilder WriteLane =
            BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
                    SB.TmpVGPR)
                .addReg(SubReg, SubKillState)
                .addImm(i % PVD.PerVGPR)
                .addReg(SB.TmpVGPR, TmpVGPRFlags);
        TmpVGPRFlags = 0;

        // There could be undef components of a spilled super register.
        // TODO: Can we detect this and skip the spill?
        if (SB.NumSubRegs > 1) {
          // The last implicit use of the SB.SuperReg carries the "Kill" flag.
          unsigned SuperKillState = 0;
          if (i + 1 == SB.NumSubRegs)
            SuperKillState |= getKillRegState(SB.IsKill);
          WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
        }
      }

      // Write out VGPR
      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
    }

    SB.restore();
  }

  MI->eraseFromParent();
  SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
  return true;
}
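
// Counterpart of spillSGPR: reload an SGPR (tuple) with v_readlane from the
// assigned spill VGPR lanes, or from scratch memory via SGPRSpillBuilder.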
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS,
                                 bool OnlyToVGPR) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
      SB.MFI.getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  if (SpillToVGPR) {
    for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
      auto MIB =
          BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
              .addReg(Spill.VGPR)
              .addImm(Spill.Lane);
      if (SB.NumSubRegs > 1 && i == 0)
        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
    }
  } else {
    SB.prepare();

    // Per VGPR helper data
    auto PVD = SB.getPerVGPRData();

    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
      // Load in VGPR data
      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);

      // Unpack lanes
      for (unsigned i = Offset * PVD.PerVGPR,
                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            SB.NumSubRegs == 1
                ? SB.SuperReg
                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

        bool LastSubReg = (i + 1 == e);
        auto MIB = BuildMI(SB.MBB, MI, SB.DL,
                           SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
                       .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
                       .addImm(i);
        if (SB.NumSubRegs > 1 && i == 0)
          MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
      }
    }

    SB.restore();
  }

  MI->eraseFromParent();
  return true;
}

/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
/// a VGPR and the stack slot can be safely eliminated when all other users are
/// handled.
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
    MachineBasicBlock::iterator MI,
    int FI,
    RegScavenger *RS) const {
  switch (MI->getOpcode()) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE:
    return spillSGPR(MI, FI, RS, true);
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
    return restoreSGPR(MI, FI, RS, true);
  default:
    llvm_unreachable("not an SGPR spill instruction");
  }
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
                          ? getBaseRegister()
                          : getFrameRegister(*MF);

  switch (MI->getOpcode()) {
  // SGPR register spill
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE: {
    spillSGPR(MI, Index, RS);
    break;
  }

  // SGPR register restore
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE: {
    restoreSGPR(MI, Index, RS);
    break;
  }

  // VGPR register spill
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A32_SAVE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
           MFI->getStackPtrOffsetReg());

    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
    auto *MBB = MI->getParent();
    buildSpillLoadStore(
        *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
        TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
        *MI->memoperands_begin(), RS);
    MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
    MI->eraseFromParent();
    break;
  }

  // VGPR register spill
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A32_SAVE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
           MFI->getStackPtrOffsetReg());

    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
    auto *MBB = MI->getParent();
    buildSpillLoadStore(
        *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
        TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
        *MI->memoperands_begin(), RS);
    MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_A1024_RESTORE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
           MFI->getStackPtrOffsetReg());

    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                          : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
    auto *MBB = MI->getParent();
    buildSpillLoadStore(
        *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
        TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
        *MI->memoperands_begin(), RS);
    MI->eraseFromParent();
    break;
  }

  default: {
    // Other access to frame index
    const DebugLoc &DL = MI->getDebugLoc();

    int64_t Offset = FrameInfo.getObjectOffset(Index);
    if (ST.enableFlatScratch()) {
      if (TII->isFLATScratch(*MI)) {
        assert((int16_t)FIOperandNum ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::saddr));

        // The offset is always swizzled, just replace it.
        if (FrameReg)
          FIOp.ChangeToRegister(FrameReg, false);

        if (!Offset)
          return;

        MachineOperand *OffsetOp =
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
        int64_t NewOffset = Offset + OffsetOp->getImm();
        if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                   SIInstrFlags::FlatScratch)) {
          OffsetOp->setImm(NewOffset);
          if (FrameReg)
            return;
          Offset = 0;
        }

        assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
               "Unexpected vaddr for flat scratch with a FI operand");

        // On GFX10 we have ST mode to use no registers for an address.
        // Otherwise we need to materialize 0 into an SGPR.
        if (!Offset && ST.hasFlatScratchSTMode()) {
          unsigned Opc = MI->getOpcode();
          unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
          MI->RemoveOperand(
              AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
          MI->setDesc(TII->get(NewOpc));
          return;
        }
      }

      if (!FrameReg) {
        FIOp.ChangeToImmediate(Offset);
        if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
          return;
      }

      // We need to use a register here. Check whether we can use an SGPR or
      // need a VGPR.
      FIOp.ChangeToRegister(AMDGPU::M0, false);
      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);

      if (!Offset && FrameReg && UseSGPR) {
        FIOp.setReg(FrameReg);
        return;
      }

      const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
                                              : &AMDGPU::VGPR_32RegClass;

      Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
      FIOp.setReg(TmpReg);
      FIOp.setIsKill(true);

      if ((!FrameReg || !Offset) && TmpReg) {
        unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
        auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
        if (FrameReg)
          MIB.addReg(FrameReg);
        else
          MIB.addImm(Offset);

        return;
      }

      Register TmpSReg =
          UseSGPR ? TmpReg
                  : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
                                         !UseSGPR);

      // TODO: for flat scratch another attempt can be made with a VGPR index
      //       if no SGPRs can be scavenged.
      if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
        report_fatal_error("Cannot scavenge register in FI elimination!");
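      // If no SGPR could be scavenged but a frame register exists, clobber
      // the frame register itself and undo the addition after the rewritten
      // access, e.g. (a sketch, offset illustrative):
      //   s_add_i32 s32, s32, 0x200   ; fold the object offset
      //   ... rewritten access using s32 ...
      //   s_add_i32 s32, s32, -0x200  ; undo the modification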

      if (!TmpSReg) {
        // Use frame register and restore it after.
        TmpSReg = FrameReg;
        FIOp.setReg(FrameReg);
        FIOp.setIsKill(false);
      }

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
          .addReg(FrameReg)
          .addImm(Offset);

      if (!UseSGPR)
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
            .addReg(TmpSReg, RegState::Kill);

      if (TmpSReg == FrameReg) {
        // Undo frame register modification.
        BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
                FrameReg)
            .addReg(FrameReg)
            .addImm(-Offset);
      }

      return;
    }

    bool IsMUBUF = TII->isMUBUF(*MI);

    if (!IsMUBUF && !MFI->isEntryFunction()) {
      // Convert to a swizzled stack address by scaling by the wave size.
      //
      // In an entry function/kernel the offset is already swizzled.

      bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
      Register ResultReg =
          IsCopy ? MI->getOperand(0).getReg()
                 : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      if (Offset == 0) {
        // XXX - This never happens because of emergency scavenging slot at 0?
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
            .addImm(ST.getWavefrontSizeLog2())
            .addReg(FrameReg);
      } else {
        if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
          // Reuse ResultReg in intermediate step.
          Register ScaledReg = ResultReg;

          BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
                  ScaledReg)
              .addImm(ST.getWavefrontSizeLog2())
              .addReg(FrameReg);

          const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;

          // TODO: Fold if use instruction is another add of a constant.
          if (IsVOP2 ||
              AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
            // FIXME: This can fail
            MIB.addImm(Offset);
            MIB.addReg(ScaledReg, RegState::Kill);
            if (!IsVOP2)
              MIB.addImm(0); // clamp bit
          } else {
            assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
                   "Need to reuse carry out register");

            // Use scavenged unused carry out as offset register.
            Register ConstOffsetReg;
            if (!isWave32)
              ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
            else
              ConstOffsetReg = MIB.getReg(1);

            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
                    ConstOffsetReg)
                .addImm(Offset);
            MIB.addReg(ConstOffsetReg, RegState::Kill);
            MIB.addReg(ScaledReg, RegState::Kill);
            MIB.addImm(0); // clamp bit
          }
        } else {
          // We have to produce a carry out, and there isn't a free SGPR pair
          // for it. We can keep the whole computation on the SALU to avoid
          // clobbering an additional register at the cost of an extra mov.

          // We may have 1 free scratch SGPR even though a carry out is
          // unavailable. Only one additional mov is needed.
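          // As a sketch (wave64, object offset 0x100, s6 scavenged):
          //   s_lshr_b32 s6, s32, 6    ; unswizzle the frame register
          //   s_add_i32  s6, s6, 0x100 ; fold in the object offset
          //   v_mov_b32  v1, s6        ; copy into the result VGPR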
          Register TmpScaledReg =
              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;

          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
              .addReg(FrameReg)
              .addImm(ST.getWavefrontSizeLog2());
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
              .addReg(ScaledReg, RegState::Kill)
              .addImm(Offset);
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
              .addReg(ScaledReg, RegState::Kill);

          // If there were truly no free SGPRs, we need to undo everything.
          if (!TmpScaledReg.isValid()) {
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
                .addReg(ScaledReg, RegState::Kill)
                .addImm(-Offset);
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
                .addReg(FrameReg)
                .addImm(ST.getWavefrontSizeLog2());
          }
        }
      }

      // Don't introduce an extra copy if we're just materializing in a mov.
      if (IsCopy)
        MI->eraseFromParent();
      else
        FIOp.ChangeToRegister(ResultReg, false, false, true);
      return;
    }
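    // For MUBUF accesses the frame offset can usually be folded into the
    // instruction's immediate offset field, e.g. (a sketch): an access of
    // %stack.1 at object offset 16 with an existing immediate offset of 8
    // becomes
    //   buffer_load_dword v0, off, s[0:3], s32 offset:24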

    if (IsMUBUF) {
      // Disable offen so we don't need a 0 vgpr base.
      assert(static_cast<int>(FIOperandNum) ==
             AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                        AMDGPU::OpName::vaddr));

      auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
      assert((SOffset.isImm() && SOffset.getImm() == 0));

      if (FrameReg != AMDGPU::NoRegister)
        SOffset.ChangeToRegister(FrameReg, false);

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      int64_t OldImm =
          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
      int64_t NewOffset = OldImm + Offset;

      if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
          buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
        MI->eraseFromParent();
        return;
      }
    }

    // If the offset is simply too big, don't convert to a scratch wave offset
    // relative index.

    FIOp.ChangeToImmediate(Offset);
    if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
      Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
      FIOp.ChangeToRegister(TmpReg, false, false, true);
    }
  }
  }
}

StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
  return AMDGPUInstPrinter::getRegisterName(Reg);
}

static const TargetRegisterClass *
getAnyVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::VReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::VReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::VReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::VReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::VReg_192RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::VReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::VReg_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::VReg_1024RegClass;

  return nullptr;
}

static const TargetRegisterClass *
getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::VReg_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::VReg_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::VReg_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::VReg_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::VReg_192_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::VReg_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::VReg_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::VReg_1024_Align2RegClass;

  return nullptr;
}
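// A usage sketch (not an exhaustive table): getVGPRClassForBitWidth(96)
// below returns VReg_96 on most subtargets, but VReg_96_Align2 on targets
// such as gfx90a that require even-aligned VGPR tuples.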

const TargetRegisterClass *
SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth == 1)
    return &AMDGPU::VReg_1RegClass;
  if (BitWidth <= 16)
    return &AMDGPU::VGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::VGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
                                : getAnyVGPRClassForBitWidth(BitWidth);
}

static const TargetRegisterClass *
getAnyAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AReg_192RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AReg_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AReg_1024RegClass;

  return nullptr;
}

static const TargetRegisterClass *
getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AReg_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AReg_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AReg_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AReg_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AReg_192_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AReg_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AReg_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AReg_1024_Align2RegClass;

  return nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth <= 16)
    return &AMDGPU::AGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::AGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
                                : getAnyAGPRClassForBitWidth(BitWidth);
}

const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 16)
    return &AMDGPU::SGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::SReg_32RegClass;
  if (BitWidth <= 64)
    return &AMDGPU::SReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::SGPR_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::SGPR_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::SGPR_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::SGPR_192RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::SGPR_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::SGPR_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::SGPR_1024RegClass;

  return nullptr;
}

// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *
SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_LO16RegClass,
    &AMDGPU::VGPR_HI16RegClass,
    &AMDGPU::SReg_LO16RegClass,
    &AMDGPU::AGPR_LO16RegClass,
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::AGPR_32RegClass,
    &AMDGPU::VReg_64_Align2RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::AReg_64_Align2RegClass,
    &AMDGPU::AReg_64RegClass,
    &AMDGPU::VReg_96_Align2RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::SReg_96RegClass,
    &AMDGPU::AReg_96_Align2RegClass,
    &AMDGPU::AReg_96RegClass,
    &AMDGPU::VReg_128_Align2RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::AReg_128_Align2RegClass,
    &AMDGPU::AReg_128RegClass,
    &AMDGPU::VReg_160_Align2RegClass,
    &AMDGPU::VReg_160RegClass,
    &AMDGPU::SReg_160RegClass,
    &AMDGPU::AReg_160_Align2RegClass,
    &AMDGPU::AReg_160RegClass,
    &AMDGPU::VReg_192_Align2RegClass,
    &AMDGPU::VReg_192RegClass,
    &AMDGPU::SReg_192RegClass,
    &AMDGPU::AReg_192_Align2RegClass,
    &AMDGPU::AReg_192RegClass,
    &AMDGPU::VReg_256_Align2RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::AReg_256_Align2RegClass,
    &AMDGPU::AReg_256RegClass,
    &AMDGPU::VReg_512_Align2RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::AReg_512_Align2RegClass,
    &AMDGPU::AReg_512RegClass,
    &AMDGPU::SReg_1024RegClass,
    &AMDGPU::VReg_1024_Align2RegClass,
    &AMDGPU::VReg_1024RegClass,
    &AMDGPU::AReg_1024_Align2RegClass,
    &AMDGPU::AReg_1024RegClass,
    &AMDGPU::SCC_CLASSRegClass,
    &AMDGPU::Pseudo_SReg_32RegClass,
    &AMDGPU::Pseudo_SReg_128RegClass,
  };

  for (const TargetRegisterClass *BaseClass : BaseClasses) {
    if (BaseClass->contains(Reg)) {
      return BaseClass;
    }
  }
  return nullptr;
}

bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
                               Register Reg) const {
  const TargetRegisterClass *RC;
  if (Reg.isVirtual())
    RC = MRI.getRegClass(Reg);
  else
    RC = getPhysRegClass(Reg);
  return isSGPRClass(RC);
}

// TODO: It might be helpful to have some target specific flags in
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
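// A usage sketch: hasVGPRs(&AMDGPU::VReg_64RegClass) is expected to return
// true, while hasVGPRs(&AMDGPU::SReg_64RegClass) returns false, since an
// SGPR class shares no common subclass with any VGPR class.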
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size == 16) {
    return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
           getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
  }
  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
  if (!VRC) {
    assert(Size < 32 && "Invalid register class size");
    return false;
  }
  return getCommonSubClass(VRC, RC) != nullptr;
}

bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size < 16)
    return false;
  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
  if (!ARC) {
    assert(getVGPRClassForBitWidth(Size) && "Invalid register class size");
    return false;
  }
  return getCommonSubClass(ARC, RC) != nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
  assert(VRC && "Invalid register class size");
  return VRC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
  assert(ARC && "Invalid register class size");
  return ARC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
  unsigned Size = getRegSizeInBits(*VRC);
  if (Size == 32)
    return &AMDGPU::SGPR_32RegClass;
  const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
  assert(SRC && "Invalid register class size");
  return SRC;
}
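// For example (a sketch): getSubRegClass(&AMDGPU::VReg_128RegClass,
// AMDGPU::sub0_sub1) yields a 64-bit VGPR class, and
// getSubRegClass(&AMDGPU::SGPR_128RegClass, AMDGPU::sub0) yields SGPR_32.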
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
    const TargetRegisterClass *RC, unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
  if (isSGPRClass(RC)) {
    if (Size == 32)
      RC = &AMDGPU::SGPR_32RegClass;
    else
      RC = getSGPRClassForBitWidth(Size);
  } else if (hasAGPRs(RC)) {
    RC = getAGPRClassForBitWidth(Size);
  } else {
    RC = getVGPRClassForBitWidth(Size);
  }
  assert(RC && "Invalid sub-register class size");
  return RC;
}

const TargetRegisterClass *
SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
                                         const TargetRegisterClass *SubRC,
                                         unsigned SubIdx) const {
  // Ensure this subregister index is aligned in the super register.
  const TargetRegisterClass *MatchRC =
      getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
  return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
}

bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
  if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
      OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
    return !ST.hasMFMAInlineLiteralBug();

  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
         OpType <= AMDGPU::OPERAND_SRC_LAST;
}

bool SIRegisterInfo::shouldRewriteCopySrc(
    const TargetRegisterClass *DefRC,
    unsigned DefSubReg,
    const TargetRegisterClass *SrcRC,
    unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want
  // to stop and rewrite on anything that looks like a subregister
  // extract. Operations mostly don't care about the super register class, so
  // we only want to stop on the most basic of copies between the same register
  // class.
  //
  // e.g. if we have something like
  // %0 = ...
  // %1 = ...
  // %2 = REG_SEQUENCE %0, sub0, %1, sub1
  // %3 = COPY %2, sub0
  //
  // We want to look through the COPY to find:
  // => %3 = COPY %0

  // Plain copy.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
  // TODO: 64-bit operands have extending behavior from 32-bit literal.
  return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
         OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
}

/// Returns the lowest-numbered register that is not used at any point in the
/// function. If all registers are used, returns AMDGPU::NoRegister. If
/// \p ReserveHighestVGPR is true, returns the highest unused register instead.
MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
                                              const TargetRegisterClass *RC,
                                              const MachineFunction &MF,
                                              bool ReserveHighestVGPR) const {
  if (ReserveHighestVGPR) {
    for (MCRegister Reg : reverse(*RC))
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  } else {
    for (MCRegister Reg : *RC)
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  }
  return MCRegister();
}
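// For example (a sketch): splitting a 128-bit register class with
// EltSize == 8 (two DWORDs per part) returns the sub-register indexes
// { sub0_sub1, sub2_sub3 }.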
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                   unsigned EltSize) const {
  const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
  assert(RegBitWidth >= 32 && RegBitWidth <= 1024);

  const unsigned RegDWORDs = RegBitWidth / 32;
  const unsigned EltDWORDs = EltSize / 4;
  assert(RegSplitParts.size() + 1 >= EltDWORDs);

  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
  const unsigned NumParts = RegDWORDs / EltDWORDs;

  return makeArrayRef(Parts.data(), NumParts);
}

const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  Register Reg) const {
  return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && hasVGPRs(RC);
}

bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);

  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && hasAGPRs(RC);
}

bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase the size of registers beyond a dword; we would need to
  // allocate adjacent registers and constrain regalloc more than needed.

  // Always allow dword coalescing.
  if (SrcSize <= 32 || DstSize <= 32)
    return true;

  return NewSize <= DstSize || NewSize <= SrcSize;
}

unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
                                                       MF.getFunction());
  switch (RC->getID()) {
  default:
    return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
  case AMDGPU::VGPR_LO16RegClassID:
  case AMDGPU::VGPR_HI16RegClassID:
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::SGPR_LO16RegClassID:
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
  }
}

unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
      Idx == AMDGPU::RegisterPressureSets::AGPR_32)
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  llvm_unreachable("Unexpected register pressure set!");
}

const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
  static const int Empty[] = { -1 };

  if (RegPressureIgnoredUnits[RegUnit])
    return Empty;

  return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}

MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee-saved register.
  return AMDGPU::SGPR30_SGPR31;
}
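// For example (a sketch): a 32-bit value on the SGPR bank maps to SReg_32,
// while a 1-bit value on the VCC bank maps to the wave-size dependent
// lanemask class chosen below.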
const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                         const RegisterBank &RB,
                                         const MachineRegisterInfo &MRI) const {
  switch (RB.getID()) {
  case AMDGPU::VGPRRegBankID:
    return getVGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::VCCRegBankID:
    assert(Size == 1);
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case AMDGPU::SGPRRegBankID:
    return getSGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::AGPRRegBankID:
    return getAGPRClassForBitWidth(std::max(32u, Size));
  default:
    llvm_unreachable("unknown register bank");
  }
}

const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                                 const MachineRegisterInfo &MRI) const {
  const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
  if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);

  const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
  return getAllocatableClass(RC);
}

MCRegister SIRegisterInfo::getVCC() const {
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}

const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
  // VGPR tuples have an alignment requirement on gfx90a variants.
  return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
                                : &AMDGPU::VReg_64RegClass;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
  switch ((int)RCID) {
  case AMDGPU::SReg_1RegClassID:
    return getBoolRC();
  case AMDGPU::SReg_1_XEXECRegClassID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case -1:
    return nullptr;
  default:
    return AMDGPUGenRegisterInfo::getRegClass(RCID);
  }
}

// Find reaching register definition
MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
                                              MachineInstr &Use,
                                              MachineRegisterInfo &MRI,
                                              LiveIntervals *LIS) const {
  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;

  if (Reg.isVirtual()) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
                                  : MRI.getMaxLaneMaskForVReg(Reg);
    VNInfo *V = nullptr;
    if (LI.hasSubRanges()) {
      for (auto &S : LI.subranges()) {
        if ((S.LaneMask & SubLanes) == SubLanes) {
          V = S.getVNInfoAt(UseIdx);
          break;
        }
      }
    } else {
      V = LI.getVNInfoAt(UseIdx);
    }
    if (!V)
      return nullptr;
    DefIdx = V->def;
  } else {
    // Find last def.
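    // A physical register may span several register units, each with its own
    // live range; keep the latest def by replacing the recorded def whenever
    // it dominates the def found for the next unit.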
    for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid();
         ++Units) {
      LiveRange &LR = LIS->getRegUnit(*Units);
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
        if (!DefIdx.isValid() ||
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
                          LIS->getInstructionFromIndex(V->def)))
          DefIdx = V->def;
      } else {
        return nullptr;
      }
    }
  }

  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);

  if (!Def || !MDT.dominates(Def, &Use))
    return nullptr;

  assert(Def->modifiesRegister(Reg, this));

  return Def;
}

MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
  assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32);

  for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
                                         AMDGPU::SReg_32RegClass,
                                         AMDGPU::AGPR_32RegClass } ) {
    if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
      return Super;
  }
  if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
                                            &AMDGPU::VGPR_32RegClass)) {
    return Super;
  }

  return AMDGPU::NoRegister;
}

bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
  if (!ST.needsAlignedVGPRs())
    return true;

  if (hasVGPRs(&RC))
    return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
  if (hasAGPRs(&RC))
    return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));

  return true;
}

bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
  switch (PhysReg) {
  case AMDGPU::SGPR_NULL:
  case AMDGPU::SRC_SHARED_BASE:
  case AMDGPU::SRC_PRIVATE_BASE:
  case AMDGPU::SRC_SHARED_LIMIT:
  case AMDGPU::SRC_PRIVATE_LIMIT:
    return true;
  default:
    return false;
  }
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 2);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}