//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation.
By forcing all inputs to VGPRs, it also simplifies the task of 67 /// picking the optimal operand combination from a post-isel optimization pass. 68 /// 69 //===----------------------------------------------------------------------===// 70 71 #include "AMDGPURegisterBankInfo.h" 72 73 #include "AMDGPU.h" 74 #include "AMDGPUGlobalISelUtils.h" 75 #include "AMDGPUInstrInfo.h" 76 #include "GCNSubtarget.h" 77 #include "SIMachineFunctionInfo.h" 78 #include "SIRegisterInfo.h" 79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" 80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 83 #include "llvm/CodeGen/RegisterBank.h" 84 #include "llvm/IR/IntrinsicsAMDGPU.h" 85 86 #define GET_TARGET_REGBANK_IMPL 87 #include "AMDGPUGenRegisterBank.inc" 88 89 // This file will be TableGen'ed at some point. 90 #include "AMDGPUGenRegisterBankInfo.def" 91 92 using namespace llvm; 93 using namespace MIPatternMatch; 94 95 namespace { 96 97 // Observer to apply a register bank to new registers created by LegalizerHelper. 98 class ApplyRegBankMapping final : public GISelChangeObserver { 99 private: 100 const AMDGPURegisterBankInfo &RBI; 101 MachineRegisterInfo &MRI; 102 const RegisterBank *NewBank; 103 SmallVector<MachineInstr *, 4> NewInsts; 104 105 public: 106 ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_, 107 MachineRegisterInfo &MRI_, const RegisterBank *RB) 108 : RBI(RBI_), MRI(MRI_), NewBank(RB) {} 109 110 ~ApplyRegBankMapping() { 111 for (MachineInstr *MI : NewInsts) 112 applyBank(*MI); 113 } 114 115 /// Set any registers that don't have a set register class or bank to SALU. 116 void applyBank(MachineInstr &MI) { 117 const unsigned Opc = MI.getOpcode(); 118 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT || 119 Opc == AMDGPU::G_SEXT) { 120 // LegalizerHelper wants to use the basic legalization artifacts when 121 // widening etc. 
We don't handle selection with vcc in artifact sources, 122 // so we need to use a select instead to handle these properly. 123 Register DstReg = MI.getOperand(0).getReg(); 124 Register SrcReg = MI.getOperand(1).getReg(); 125 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI); 126 if (SrcBank == &AMDGPU::VCCRegBank) { 127 const LLT S32 = LLT::scalar(32); 128 assert(MRI.getType(SrcReg) == LLT::scalar(1)); 129 assert(MRI.getType(DstReg) == S32); 130 assert(NewBank == &AMDGPU::VGPRRegBank); 131 132 // Replace the extension with a select, which really uses the boolean 133 // source. 134 MachineIRBuilder B(MI); 135 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1); 136 auto False = B.buildConstant(S32, 0); 137 B.buildSelect(DstReg, SrcReg, True, False); 138 MRI.setRegBank(True.getReg(0), *NewBank); 139 MRI.setRegBank(False.getReg(0), *NewBank); 140 MI.eraseFromParent(); 141 } 142 143 assert(!MRI.getRegClassOrRegBank(DstReg)); 144 MRI.setRegBank(DstReg, *NewBank); 145 return; 146 } 147 148 #ifndef NDEBUG 149 if (Opc == AMDGPU::G_TRUNC) { 150 Register DstReg = MI.getOperand(0).getReg(); 151 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI); 152 assert(DstBank != &AMDGPU::VCCRegBank); 153 } 154 #endif 155 156 for (MachineOperand &Op : MI.operands()) { 157 if (!Op.isReg()) 158 continue; 159 160 // We may see physical registers if building a real MI 161 Register Reg = Op.getReg(); 162 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg)) 163 continue; 164 165 const RegisterBank *RB = NewBank; 166 if (MRI.getType(Reg) == LLT::scalar(1)) { 167 assert(NewBank == &AMDGPU::VGPRRegBank && 168 "s1 operands should only be used for vector bools"); 169 assert((MI.getOpcode() != AMDGPU::G_TRUNC && 170 MI.getOpcode() != AMDGPU::G_ANYEXT) && 171 "not expecting legalization artifacts here"); 172 RB = &AMDGPU::VCCRegBank; 173 } 174 175 MRI.setRegBank(Reg, *RB); 176 } 177 } 178 179 void erasingInstr(MachineInstr &MI) override {} 180 181 
void createdInstr(MachineInstr &MI) override { 182 // At this point, the instruction was just inserted and has no operands. 183 NewInsts.push_back(&MI); 184 } 185 186 void changingInstr(MachineInstr &MI) override {} 187 void changedInstr(MachineInstr &MI) override { 188 // FIXME: In principle we should probably add the instruction to NewInsts, 189 // but the way the LegalizerHelper uses the observer, we will always see the 190 // registers we need to set the regbank on also referenced in a new 191 // instruction. 192 } 193 }; 194 195 } 196 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) 197 : AMDGPUGenRegisterBankInfo(), 198 Subtarget(ST), 199 TRI(Subtarget.getRegisterInfo()), 200 TII(Subtarget.getInstrInfo()) { 201 202 // HACK: Until this is fully tablegen'd. 203 static llvm::once_flag InitializeRegisterBankFlag; 204 205 static auto InitializeRegisterBankOnce = [this]() { 206 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && 207 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && 208 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank); 209 (void)this; 210 }; 211 212 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce); 213 } 214 215 static bool isVectorRegisterBank(const RegisterBank &Bank) { 216 unsigned BankID = Bank.getID(); 217 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; 218 } 219 220 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, 221 const RegisterBank &Src, 222 unsigned Size) const { 223 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? 224 if (Dst.getID() == AMDGPU::SGPRRegBankID && 225 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) { 226 return std::numeric_limits<unsigned>::max(); 227 } 228 229 // Bool values are tricky, because the meaning is based on context. The SCC 230 // and VCC banks are for the natural scalar and vector conditions produced by 231 // a compare. 
232 // 233 // Legalization doesn't know about the necessary context, so an s1 use may 234 // have been a truncate from an arbitrary value, in which case a copy (lowered 235 // as a compare with 0) needs to be inserted. 236 if (Size == 1 && 237 (Dst.getID() == AMDGPU::SGPRRegBankID) && 238 (isVectorRegisterBank(Src) || 239 Src.getID() == AMDGPU::SGPRRegBankID || 240 Src.getID() == AMDGPU::VCCRegBankID)) 241 return std::numeric_limits<unsigned>::max(); 242 243 // There is no direct copy between AGPRs. 244 if (Dst.getID() == AMDGPU::AGPRRegBankID && 245 Src.getID() == AMDGPU::AGPRRegBankID) 246 return 4; 247 248 return RegisterBankInfo::copyCost(Dst, Src, Size); 249 } 250 251 unsigned AMDGPURegisterBankInfo::getBreakDownCost( 252 const ValueMapping &ValMapping, 253 const RegisterBank *CurBank) const { 254 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to 255 // VGPR. 256 // FIXME: Is there a better way to do this? 257 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) 258 return 10; // This is expensive. 259 260 assert(ValMapping.NumBreakDowns == 2 && 261 ValMapping.BreakDown[0].Length == 32 && 262 ValMapping.BreakDown[0].StartIdx == 0 && 263 ValMapping.BreakDown[1].Length == 32 && 264 ValMapping.BreakDown[1].StartIdx == 32 && 265 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank); 266 267 // 32-bit extract of a 64-bit value is just access of a subregister, so free. 268 // TODO: Cost of 0 hits assert, though it's not clear it's what we really 269 // want. 270 271 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR 272 // alignment restrictions, but this probably isn't important. 273 return 1; 274 } 275 276 const RegisterBank & 277 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, 278 LLT Ty) const { 279 if (&RC == &AMDGPU::SReg_1RegClass) 280 return AMDGPU::VCCRegBank; 281 282 // We promote real scalar booleans to SReg_32. 
Any SGPR using s1 is really a 283 // VCC-like use. 284 if (TRI->isSGPRClass(&RC)) { 285 // FIXME: This probably came from a copy from a physical register, which 286 // should be inferable from the copied to-type. We don't have many boolean 287 // physical register constraints so just assume a normal SGPR for now. 288 if (!Ty.isValid()) 289 return AMDGPU::SGPRRegBank; 290 291 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; 292 } 293 294 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank; 295 } 296 297 template <unsigned NumOps> 298 RegisterBankInfo::InstructionMappings 299 AMDGPURegisterBankInfo::addMappingFromTable( 300 const MachineInstr &MI, const MachineRegisterInfo &MRI, 301 const std::array<unsigned, NumOps> RegSrcOpIdx, 302 ArrayRef<OpRegBankEntry<NumOps>> Table) const { 303 304 InstructionMappings AltMappings; 305 306 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); 307 308 unsigned Sizes[NumOps]; 309 for (unsigned I = 0; I < NumOps; ++I) { 310 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); 311 Sizes[I] = getSizeInBits(Reg, MRI, *TRI); 312 } 313 314 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { 315 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); 316 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); 317 } 318 319 // getInstrMapping's default mapping uses ID 1, so start at 2. 
320 unsigned MappingID = 2; 321 for (const auto &Entry : Table) { 322 for (unsigned I = 0; I < NumOps; ++I) { 323 int OpIdx = RegSrcOpIdx[I]; 324 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]); 325 } 326 327 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost, 328 getOperandsMapping(Operands), 329 Operands.size())); 330 } 331 332 return AltMappings; 333 } 334 335 RegisterBankInfo::InstructionMappings 336 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( 337 const MachineInstr &MI, const MachineRegisterInfo &MRI) const { 338 switch (MI.getIntrinsicID()) { 339 case Intrinsic::amdgcn_readlane: { 340 static const OpRegBankEntry<3> Table[2] = { 341 // Perfectly legal. 342 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 343 344 // Need a readfirstlane for the index. 345 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } 346 }; 347 348 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; 349 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 350 } 351 case Intrinsic::amdgcn_writelane: { 352 static const OpRegBankEntry<4> Table[4] = { 353 // Perfectly legal. 
354 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 355 356 // Need readfirstlane of first op 357 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, 358 359 // Need readfirstlane of second op 360 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, 361 362 // Need readfirstlane of both ops 363 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } 364 }; 365 366 // rsrc, voffset, offset 367 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } }; 368 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 369 } 370 default: 371 return RegisterBankInfo::getInstrAlternativeMappings(MI); 372 } 373 } 374 375 RegisterBankInfo::InstructionMappings 376 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( 377 const MachineInstr &MI, const MachineRegisterInfo &MRI) const { 378 379 switch (MI.getIntrinsicID()) { 380 case Intrinsic::amdgcn_s_buffer_load: { 381 static const OpRegBankEntry<2> Table[4] = { 382 // Perfectly legal. 383 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 384 385 // Only need 1 register in loop 386 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }, 387 388 // Have to waterfall the resource. 389 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, 390 391 // Have to waterfall the resource, and the offset. 392 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 } 393 }; 394 395 // rsrc, offset 396 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } }; 397 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 398 } 399 case Intrinsic::amdgcn_ds_ordered_add: 400 case Intrinsic::amdgcn_ds_ordered_swap: { 401 // VGPR = M0, VGPR 402 static const OpRegBankEntry<3> Table[2] = { 403 // Perfectly legal. 
404 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 405 406 // Need a readfirstlane for m0 407 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } 408 }; 409 410 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; 411 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 412 } 413 case Intrinsic::amdgcn_s_sendmsg: 414 case Intrinsic::amdgcn_s_sendmsghalt: { 415 // FIXME: Should have no register for immediate 416 static const OpRegBankEntry<1> Table[2] = { 417 // Perfectly legal. 418 { { AMDGPU::SGPRRegBankID }, 1 }, 419 420 // Need readlane 421 { { AMDGPU::VGPRRegBankID }, 3 } 422 }; 423 424 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } }; 425 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 426 } 427 default: 428 return RegisterBankInfo::getInstrAlternativeMappings(MI); 429 } 430 } 431 432 // FIXME: Returns uniform if there's no source value information. This is 433 // probably wrong. 434 static bool isScalarLoadLegal(const MachineInstr &MI) { 435 if (!MI.hasOneMemOperand()) 436 return false; 437 438 const MachineMemOperand *MMO = *MI.memoperands_begin(); 439 const unsigned AS = MMO->getAddrSpace(); 440 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || 441 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; 442 // Require 4-byte alignment. 443 return MMO->getAlign() >= Align(4) && 444 // Can't do a scalar atomic load. 445 !MMO->isAtomic() && 446 // Don't use scalar loads for volatile accesses to non-constant address 447 // spaces. 448 (IsConst || !MMO->isVolatile()) && 449 // Memory must be known constant, or not written before this load. 
450 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) && 451 AMDGPUInstrInfo::isUniformMMO(MMO); 452 } 453 454 RegisterBankInfo::InstructionMappings 455 AMDGPURegisterBankInfo::getInstrAlternativeMappings( 456 const MachineInstr &MI) const { 457 458 const MachineFunction &MF = *MI.getParent()->getParent(); 459 const MachineRegisterInfo &MRI = MF.getRegInfo(); 460 461 462 InstructionMappings AltMappings; 463 switch (MI.getOpcode()) { 464 case TargetOpcode::G_CONSTANT: { 465 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 466 if (Size == 1) { 467 static const OpRegBankEntry<1> Table[3] = { 468 { { AMDGPU::VGPRRegBankID }, 1 }, 469 { { AMDGPU::SGPRRegBankID }, 1 }, 470 { { AMDGPU::VCCRegBankID }, 1 } 471 }; 472 473 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); 474 } 475 476 LLVM_FALLTHROUGH; 477 } 478 case TargetOpcode::G_FCONSTANT: 479 case TargetOpcode::G_FRAME_INDEX: 480 case TargetOpcode::G_GLOBAL_VALUE: { 481 static const OpRegBankEntry<1> Table[2] = { 482 { { AMDGPU::VGPRRegBankID }, 1 }, 483 { { AMDGPU::SGPRRegBankID }, 1 } 484 }; 485 486 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); 487 } 488 case TargetOpcode::G_AND: 489 case TargetOpcode::G_OR: 490 case TargetOpcode::G_XOR: { 491 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 492 493 if (Size == 1) { 494 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. 
495 const InstructionMapping &SCCMapping = getInstructionMapping( 496 1, 1, getOperandsMapping( 497 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), 498 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), 499 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}), 500 3); // Num Operands 501 AltMappings.push_back(&SCCMapping); 502 503 const InstructionMapping &VCCMapping0 = getInstructionMapping( 504 2, 1, getOperandsMapping( 505 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), 506 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), 507 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}), 508 3); // Num Operands 509 AltMappings.push_back(&VCCMapping0); 510 return AltMappings; 511 } 512 513 if (Size != 64) 514 break; 515 516 const InstructionMapping &SSMapping = getInstructionMapping( 517 1, 1, getOperandsMapping( 518 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 519 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 520 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 521 3); // Num Operands 522 AltMappings.push_back(&SSMapping); 523 524 const InstructionMapping &VVMapping = getInstructionMapping( 525 2, 2, getOperandsMapping( 526 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 527 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 528 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 529 3); // Num Operands 530 AltMappings.push_back(&VVMapping); 531 break; 532 } 533 case TargetOpcode::G_LOAD: 534 case TargetOpcode::G_ZEXTLOAD: 535 case TargetOpcode::G_SEXTLOAD: { 536 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 537 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 538 unsigned PtrSize = PtrTy.getSizeInBits(); 539 unsigned AS = PtrTy.getAddressSpace(); 540 541 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && 542 AS != AMDGPUAS::PRIVATE_ADDRESS) && 543 isScalarLoadLegal(MI)) { 544 const InstructionMapping &SSMapping = getInstructionMapping( 545 
1, 1, getOperandsMapping( 546 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 547 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}), 548 2); // Num Operands 549 AltMappings.push_back(&SSMapping); 550 } 551 552 const InstructionMapping &VVMapping = getInstructionMapping( 553 2, 1, 554 getOperandsMapping( 555 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 556 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), 557 2); // Num Operands 558 AltMappings.push_back(&VVMapping); 559 560 // It may be possible to have a vgpr = load sgpr mapping here, because 561 // the mubuf instructions support this kind of load, but probably for only 562 // gfx7 and older. However, the addressing mode matching in the instruction 563 // selector should be able to do a better job of detecting and selecting 564 // these kinds of loads from the vgpr = load vgpr mapping. 565 566 return AltMappings; 567 568 } 569 case TargetOpcode::G_SELECT: { 570 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 571 const InstructionMapping &SSMapping = getInstructionMapping(1, 1, 572 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 573 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), 574 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 575 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 576 4); // Num Operands 577 AltMappings.push_back(&SSMapping); 578 579 const InstructionMapping &VVMapping = getInstructionMapping(2, 1, 580 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 581 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), 582 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 583 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 584 4); // Num Operands 585 AltMappings.push_back(&VVMapping); 586 587 return AltMappings; 588 } 589 case TargetOpcode::G_UADDE: 590 case TargetOpcode::G_USUBE: 591 case TargetOpcode::G_SADDE: 592 case TargetOpcode::G_SSUBE: { 593 
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 594 const InstructionMapping &SSMapping = getInstructionMapping(1, 1, 595 getOperandsMapping( 596 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 597 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), 598 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 599 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 600 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}), 601 5); // Num Operands 602 AltMappings.push_back(&SSMapping); 603 604 const InstructionMapping &VVMapping = getInstructionMapping(2, 1, 605 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 606 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), 607 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 608 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 609 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}), 610 5); // Num Operands 611 AltMappings.push_back(&VVMapping); 612 return AltMappings; 613 } 614 case AMDGPU::G_BRCOND: { 615 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 616 617 // TODO: Change type to 32 for scalar 618 const InstructionMapping &SMapping = getInstructionMapping( 619 1, 1, getOperandsMapping( 620 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}), 621 2); // Num Operands 622 AltMappings.push_back(&SMapping); 623 624 const InstructionMapping &VMapping = getInstructionMapping( 625 1, 1, getOperandsMapping( 626 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }), 627 2); // Num Operands 628 AltMappings.push_back(&VMapping); 629 return AltMappings; 630 } 631 case AMDGPU::G_INTRINSIC: 632 return getInstrAlternativeMappingsIntrinsic(MI, MRI); 633 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: 634 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); 635 default: 636 break; 637 } 638 return RegisterBankInfo::getInstrAlternativeMappings(MI); 639 } 640 641 void AMDGPURegisterBankInfo::split64BitValueForMapping( 642 MachineIRBuilder 
&B, 643 SmallVector<Register, 2> &Regs, 644 LLT HalfTy, 645 Register Reg) const { 646 assert(HalfTy.getSizeInBits() == 32); 647 MachineRegisterInfo *MRI = B.getMRI(); 648 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy); 649 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy); 650 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI); 651 MRI->setRegBank(LoLHS, *Bank); 652 MRI->setRegBank(HiLHS, *Bank); 653 654 Regs.push_back(LoLHS); 655 Regs.push_back(HiLHS); 656 657 B.buildInstr(AMDGPU::G_UNMERGE_VALUES) 658 .addDef(LoLHS) 659 .addDef(HiLHS) 660 .addUse(Reg); 661 } 662 663 /// Replace the current type each register in \p Regs has with \p NewTy 664 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, 665 LLT NewTy) { 666 for (Register Reg : Regs) { 667 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()); 668 MRI.setType(Reg, NewTy); 669 } 670 } 671 672 static LLT getHalfSizedType(LLT Ty) { 673 if (Ty.isVector()) { 674 assert(Ty.getElementCount().isKnownMultipleOf(2)); 675 return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2), 676 Ty.getElementType()); 677 } 678 679 assert(Ty.getScalarSizeInBits() % 2 == 0); 680 return LLT::scalar(Ty.getScalarSizeInBits() / 2); 681 } 682 683 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If 684 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to 685 /// execute the instruction for each unique combination of values in all lanes 686 /// in the wave. The block will be split such that rest of the instructions are 687 /// moved to a new block. 
688 /// 689 /// Essentially performs this loop: 690 // 691 /// Save Execution Mask 692 /// For (Lane : Wavefront) { 693 /// Enable Lane, Disable all other lanes 694 /// SGPR = read SGPR value for current lane from VGPR 695 /// VGPRResult[Lane] = use_op SGPR 696 /// } 697 /// Restore Execution Mask 698 /// 699 /// There is additional complexity to try for compare values to identify the 700 /// unique values used. 701 bool AMDGPURegisterBankInfo::executeInWaterfallLoop( 702 MachineIRBuilder &B, 703 iterator_range<MachineBasicBlock::iterator> Range, 704 SmallSet<Register, 4> &SGPROperandRegs, 705 MachineRegisterInfo &MRI) const { 706 707 // Track use registers which have already been expanded with a readfirstlane 708 // sequence. This may have multiple uses if moving a sequence. 709 DenseMap<Register, Register> WaterfalledRegMap; 710 711 MachineBasicBlock &MBB = B.getMBB(); 712 MachineFunction *MF = &B.getMF(); 713 714 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); 715 const unsigned WaveAndOpc = Subtarget.isWave32() ? 716 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 717 const unsigned MovExecOpc = 718 Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 719 const unsigned MovExecTermOpc = 720 Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; 721 722 const unsigned XorTermOpc = Subtarget.isWave32() ? 723 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; 724 const unsigned AndSaveExecOpc = Subtarget.isWave32() ? 725 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; 726 const unsigned ExecReg = Subtarget.isWave32() ? 727 AMDGPU::EXEC_LO : AMDGPU::EXEC; 728 729 #ifndef NDEBUG 730 const int OrigRangeSize = std::distance(Range.begin(), Range.end()); 731 #endif 732 733 Register SaveExecReg = MRI.createVirtualRegister(WaveRC); 734 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); 735 736 // Don't bother using generic instructions/registers for the exec mask. 
737 B.buildInstr(TargetOpcode::IMPLICIT_DEF) 738 .addDef(InitSaveExecReg); 739 740 Register PhiExec = MRI.createVirtualRegister(WaveRC); 741 Register NewExec = MRI.createVirtualRegister(WaveRC); 742 743 // To insert the loop we need to split the block. Move everything before this 744 // point to a new block, and insert a new empty block before this instruction. 745 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); 746 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); 747 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); 748 MachineFunction::iterator MBBI(MBB); 749 ++MBBI; 750 MF->insert(MBBI, LoopBB); 751 MF->insert(MBBI, RestoreExecBB); 752 MF->insert(MBBI, RemainderBB); 753 754 LoopBB->addSuccessor(RestoreExecBB); 755 LoopBB->addSuccessor(LoopBB); 756 757 // Move the rest of the block into a new block. 758 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 759 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); 760 761 MBB.addSuccessor(LoopBB); 762 RestoreExecBB->addSuccessor(RemainderBB); 763 764 B.setInsertPt(*LoopBB, LoopBB->end()); 765 766 B.buildInstr(TargetOpcode::PHI) 767 .addDef(PhiExec) 768 .addReg(InitSaveExecReg) 769 .addMBB(&MBB) 770 .addReg(NewExec) 771 .addMBB(LoopBB); 772 773 const DebugLoc &DL = B.getDL(); 774 775 MachineInstr &FirstInst = *Range.begin(); 776 777 // Move the instruction into the loop. Note we moved everything after 778 // Range.end() already into a new block, so Range.end() is no longer valid. 779 LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); 780 781 // Figure out the iterator range after splicing the instructions. 
782 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); 783 auto NewEnd = LoopBB->end(); 784 785 MachineBasicBlock::iterator I = Range.begin(); 786 B.setInsertPt(*LoopBB, I); 787 788 Register CondReg; 789 790 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); 791 792 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { 793 for (MachineOperand &Op : MI.uses()) { 794 if (!Op.isReg() || Op.isDef()) 795 continue; 796 797 Register OldReg = Op.getReg(); 798 if (!SGPROperandRegs.count(OldReg)) 799 continue; 800 801 // See if we already processed this register in another instruction in the 802 // sequence. 803 auto OldVal = WaterfalledRegMap.find(OldReg); 804 if (OldVal != WaterfalledRegMap.end()) { 805 Op.setReg(OldVal->second); 806 continue; 807 } 808 809 Register OpReg = Op.getReg(); 810 LLT OpTy = MRI.getType(OpReg); 811 812 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI); 813 if (OpBank != &AMDGPU::VGPRRegBank) { 814 // Insert copy from AGPR to VGPR before the loop. 815 B.setMBB(MBB); 816 OpReg = B.buildCopy(OpTy, OpReg).getReg(0); 817 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank); 818 B.setInstr(*I); 819 } 820 821 unsigned OpSize = OpTy.getSizeInBits(); 822 823 // Can only do a readlane of 32-bit pieces. 824 if (OpSize == 32) { 825 // Avoid extra copies in the simple case of one 32-bit register. 826 Register CurrentLaneOpReg 827 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 828 MRI.setType(CurrentLaneOpReg, OpTy); 829 830 constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI); 831 // Read the next variant <- also loop target. 832 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 833 CurrentLaneOpReg) 834 .addReg(OpReg); 835 836 Register NewCondReg = MRI.createVirtualRegister(WaveRC); 837 bool First = CondReg == AMDGPU::NoRegister; 838 if (First) 839 CondReg = NewCondReg; 840 841 // Compare the just read M0 value to all possible Idx values. 
842 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) 843 .addDef(NewCondReg) 844 .addReg(CurrentLaneOpReg) 845 .addReg(OpReg); 846 Op.setReg(CurrentLaneOpReg); 847 848 if (!First) { 849 Register AndReg = MRI.createVirtualRegister(WaveRC); 850 851 // If there are multiple operands to consider, and the conditions. 852 B.buildInstr(WaveAndOpc) 853 .addDef(AndReg) 854 .addReg(NewCondReg) 855 .addReg(CondReg); 856 CondReg = AndReg; 857 } 858 } else { 859 LLT S32 = LLT::scalar(32); 860 SmallVector<Register, 8> ReadlanePieces; 861 862 // The compares can be done as 64-bit, but the extract needs to be done 863 // in 32-bit pieces. 864 865 bool Is64 = OpSize % 64 == 0; 866 867 unsigned UnmergeTySize = Is64 ? 64 : 32; 868 unsigned CmpOp = 869 Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64; 870 871 // Insert the unmerge before the loop. 872 873 B.setMBB(MBB); 874 unsigned NumPieces = OpSize / UnmergeTySize; 875 SmallVector<Register, 8> UnmergePieces; 876 if (NumPieces == 1) { 877 UnmergePieces.push_back(OpReg); 878 } else { 879 LLT UnmergeTy = LLT::scalar(UnmergeTySize); 880 MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg); 881 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) 882 UnmergePieces.push_back(Unmerge.getReg(PieceIdx)); 883 } 884 B.setInstr(*I); 885 886 for (Register UnmergePiece : UnmergePieces) { 887 Register CurrentLaneOpReg; 888 if (Is64) { 889 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); 890 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); 891 892 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); 893 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); 894 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); 895 896 // Read the next variant <- also loop target. 897 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 898 CurrentLaneOpRegLo) 899 .addReg(UnmergePiece, 0, AMDGPU::sub0); 900 901 // Read the next variant <- also loop target. 
902 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 903 CurrentLaneOpRegHi) 904 .addReg(UnmergePiece, 0, AMDGPU::sub1); 905 906 CurrentLaneOpReg = 907 B.buildMerge(LLT::scalar(64), 908 {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) 909 .getReg(0); 910 911 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); 912 913 if (OpTy.getScalarSizeInBits() == 64) { 914 // If we need to produce a 64-bit element vector, so use the 915 // merged pieces 916 ReadlanePieces.push_back(CurrentLaneOpReg); 917 } else { 918 // 32-bit element type. 919 ReadlanePieces.push_back(CurrentLaneOpRegLo); 920 ReadlanePieces.push_back(CurrentLaneOpRegHi); 921 } 922 } else { 923 CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); 924 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); 925 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); 926 927 // Read the next variant <- also loop target. 928 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 929 CurrentLaneOpReg) 930 .addReg(UnmergePiece); 931 ReadlanePieces.push_back(CurrentLaneOpReg); 932 } 933 934 Register NewCondReg = MRI.createVirtualRegister(WaveRC); 935 bool First = CondReg == AMDGPU::NoRegister; 936 if (First) 937 CondReg = NewCondReg; 938 939 B.buildInstr(CmpOp) 940 .addDef(NewCondReg) 941 .addReg(CurrentLaneOpReg) 942 .addReg(UnmergePiece); 943 944 if (!First) { 945 Register AndReg = MRI.createVirtualRegister(WaveRC); 946 947 // If there are multiple operands to consider, and the conditions. 
948 B.buildInstr(WaveAndOpc) 949 .addDef(AndReg) 950 .addReg(NewCondReg) 951 .addReg(CondReg); 952 CondReg = AndReg; 953 } 954 } 955 956 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not 957 // BUILD_VECTOR 958 if (OpTy.isVector()) { 959 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); 960 Op.setReg(Merge.getReg(0)); 961 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank); 962 } else if (ReadlanePieces.size() > 1) { 963 auto Merge = B.buildMerge(OpTy, ReadlanePieces); 964 Op.setReg(Merge.getReg(0)); 965 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank); 966 } else { 967 Op.setReg(ReadlanePieces[0]); 968 } 969 } 970 971 // Make sure we don't re-process this register again. 972 WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg())); 973 } 974 } 975 976 // Update EXEC, save the original EXEC value to VCC. 977 B.buildInstr(AndSaveExecOpc) 978 .addDef(NewExec) 979 .addReg(CondReg, RegState::Kill); 980 981 MRI.setSimpleHint(NewExec, CondReg); 982 983 B.setInsertPt(*LoopBB, LoopBB->end()); 984 985 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 986 B.buildInstr(XorTermOpc) 987 .addDef(ExecReg) 988 .addReg(ExecReg) 989 .addReg(NewExec); 990 991 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use 992 // s_cbranch_scc0? 993 994 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. 995 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); 996 997 // Save the EXEC mask before the loop. 998 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) 999 .addReg(ExecReg); 1000 1001 // Restore the EXEC mask after the loop. 1002 B.setMBB(*RestoreExecBB); 1003 B.buildInstr(MovExecTermOpc) 1004 .addDef(ExecReg) 1005 .addReg(SaveExecReg); 1006 1007 // Set the insert point after the original instruction, so any new 1008 // instructions will be in the remainder. 
1009 B.setInsertPt(*RemainderBB, RemainderBB->begin()); 1010 1011 return true; 1012 } 1013 1014 // Return any unique registers used by \p MI at \p OpIndices that need to be 1015 // handled in a waterfall loop. Returns these registers in \p 1016 // SGPROperandRegs. Returns true if there are any operands to handle and a 1017 // waterfall loop is necessary. 1018 bool AMDGPURegisterBankInfo::collectWaterfallOperands( 1019 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, 1020 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { 1021 for (unsigned Op : OpIndices) { 1022 assert(MI.getOperand(Op).isUse()); 1023 Register Reg = MI.getOperand(Op).getReg(); 1024 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); 1025 if (OpBank->getID() != AMDGPU::SGPRRegBankID) 1026 SGPROperandRegs.insert(Reg); 1027 } 1028 1029 // No operands need to be replaced, so no need to loop. 1030 return !SGPROperandRegs.empty(); 1031 } 1032 1033 bool AMDGPURegisterBankInfo::executeInWaterfallLoop( 1034 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, 1035 ArrayRef<unsigned> OpIndices) const { 1036 // Use a set to avoid extra readfirstlanes in the case where multiple operands 1037 // are the same register. 1038 SmallSet<Register, 4> SGPROperandRegs; 1039 1040 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices)) 1041 return false; 1042 1043 MachineBasicBlock::iterator I = MI.getIterator(); 1044 return executeInWaterfallLoop(B, make_range(I, std::next(I)), 1045 SGPROperandRegs, MRI); 1046 } 1047 1048 bool AMDGPURegisterBankInfo::executeInWaterfallLoop( 1049 MachineInstr &MI, MachineRegisterInfo &MRI, 1050 ArrayRef<unsigned> OpIndices) const { 1051 MachineIRBuilder B(MI); 1052 return executeInWaterfallLoop(B, MI, MRI, OpIndices); 1053 } 1054 1055 // Legalize an operand that must be an SGPR by inserting a readfirstlane. 
1056 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( 1057 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const { 1058 Register Reg = MI.getOperand(OpIdx).getReg(); 1059 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 1060 if (Bank == &AMDGPU::SGPRRegBank) 1061 return; 1062 1063 LLT Ty = MRI.getType(Reg); 1064 MachineIRBuilder B(MI); 1065 1066 if (Bank != &AMDGPU::VGPRRegBank) { 1067 // We need to copy from AGPR to VGPR 1068 Reg = B.buildCopy(Ty, Reg).getReg(0); 1069 MRI.setRegBank(Reg, AMDGPU::VGPRRegBank); 1070 } 1071 1072 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1073 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32) 1074 .addDef(SGPR) 1075 .addReg(Reg); 1076 1077 MRI.setType(SGPR, Ty); 1078 1079 const TargetRegisterClass *Constrained = 1080 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI); 1081 (void)Constrained; 1082 assert(Constrained && "Failed to constrain readfirstlane src reg"); 1083 1084 MI.getOperand(OpIdx).setReg(SGPR); 1085 } 1086 1087 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the 1088 /// rest will be in the remainder. 
1089 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { 1090 unsigned TotalSize = Ty.getSizeInBits(); 1091 if (!Ty.isVector()) 1092 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)}; 1093 1094 LLT EltTy = Ty.getElementType(); 1095 unsigned EltSize = EltTy.getSizeInBits(); 1096 assert(FirstSize % EltSize == 0); 1097 1098 unsigned FirstPartNumElts = FirstSize / EltSize; 1099 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; 1100 1101 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy), 1102 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)}; 1103 } 1104 1105 static LLT widen96To128(LLT Ty) { 1106 if (!Ty.isVector()) 1107 return LLT::scalar(128); 1108 1109 LLT EltTy = Ty.getElementType(); 1110 assert(128 % EltTy.getSizeInBits() == 0); 1111 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy); 1112 } 1113 1114 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, 1115 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1116 MachineRegisterInfo &MRI) const { 1117 Register DstReg = MI.getOperand(0).getReg(); 1118 const LLT LoadTy = MRI.getType(DstReg); 1119 unsigned LoadSize = LoadTy.getSizeInBits(); 1120 const unsigned MaxNonSmrdLoadSize = 128; 1121 1122 const RegisterBank *DstBank = 1123 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1124 if (DstBank == &AMDGPU::SGPRRegBank) { 1125 // There are some special cases that we need to look at for 32 bit and 96 1126 // bit SGPR loads otherwise we have nothing to do. 1127 if (LoadSize != 32 && LoadSize != 96) 1128 return false; 1129 1130 MachineMemOperand *MMO = *MI.memoperands_begin(); 1131 const unsigned MemSize = 8 * MMO->getSize(); 1132 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to 1133 // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit 1134 // scalar loads should have a load size of 32 but memory access size of less 1135 // than 32. 
1136 if (LoadSize == 32 && 1137 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI))) 1138 return false; 1139 1140 Register PtrReg = MI.getOperand(1).getReg(); 1141 1142 ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank); 1143 MachineIRBuilder B(MI, O); 1144 1145 if (LoadSize == 32) { 1146 // This is an extending load from a sub-dword size. Widen the memory 1147 // access size to 4 bytes and clear the extra high bits appropriately 1148 const LLT S32 = LLT::scalar(32); 1149 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) { 1150 // Must extend the sign bit into higher bits for a G_SEXTLOAD 1151 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); 1152 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize); 1153 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) { 1154 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD 1155 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); 1156 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize); 1157 } else 1158 // We do not need to touch the higher bits for regular loads. 1159 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0); 1160 } else { 1161 // 96-bit loads are only available for vector loads. We need to split this 1162 // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). 
1163 if (MMO->getAlign() < Align(16)) { 1164 MachineFunction *MF = MI.getParent()->getParent(); 1165 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 1166 MachineIRBuilder B(MI, ApplyBank); 1167 LegalizerHelper Helper(*MF, ApplyBank, B); 1168 LLT Part64, Part32; 1169 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); 1170 if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) != 1171 LegalizerHelper::Legalized) 1172 return false; 1173 return true; 1174 } else { 1175 LLT WiderTy = widen96To128(LoadTy); 1176 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); 1177 if (WiderTy.isScalar()) 1178 B.buildTrunc(MI.getOperand(0), WideLoad); 1179 else { 1180 B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(), 1181 WideLoad); 1182 } 1183 } 1184 } 1185 1186 MI.eraseFromParent(); 1187 return true; 1188 } 1189 1190 // 128-bit loads are supported for all instruction types. 1191 if (LoadSize <= MaxNonSmrdLoadSize) 1192 return false; 1193 1194 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0)); 1195 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1)); 1196 1197 if (SrcRegs.empty()) 1198 SrcRegs.push_back(MI.getOperand(1).getReg()); 1199 1200 assert(LoadSize % MaxNonSmrdLoadSize == 0); 1201 1202 // RegBankSelect only emits scalar types, so we need to reset the pointer 1203 // operand to a pointer type. 
1204 Register BasePtrReg = SrcRegs[0]; 1205 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 1206 MRI.setType(BasePtrReg, PtrTy); 1207 1208 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; 1209 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts); 1210 ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank); 1211 MachineIRBuilder B(MI, Observer); 1212 LegalizerHelper Helper(B.getMF(), Observer, B); 1213 1214 if (LoadTy.isVector()) { 1215 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1216 return false; 1217 } else { 1218 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1219 return false; 1220 } 1221 1222 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 1223 return true; 1224 } 1225 1226 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( 1227 MachineInstr &MI, 1228 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1229 MachineRegisterInfo &MRI) const { 1230 const MachineFunction &MF = *MI.getMF(); 1231 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1232 const auto &TFI = *ST.getFrameLowering(); 1233 1234 // Guard in case the stack growth direction ever changes with scratch 1235 // instructions. 1236 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown) 1237 return false; 1238 1239 Register Dst = MI.getOperand(0).getReg(); 1240 Register AllocSize = MI.getOperand(1).getReg(); 1241 Align Alignment = assumeAligned(MI.getOperand(2).getImm()); 1242 1243 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI); 1244 1245 // TODO: Need to emit a wave reduction to get the maximum size. 
1246 if (SizeBank != &AMDGPU::SGPRRegBank) 1247 return false; 1248 1249 LLT PtrTy = MRI.getType(Dst); 1250 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); 1251 1252 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1253 Register SPReg = Info->getStackPtrOffsetReg(); 1254 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); 1255 MachineIRBuilder B(MI, ApplyBank); 1256 1257 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2()); 1258 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize); 1259 1260 auto SPCopy = B.buildCopy(PtrTy, SPReg); 1261 if (Alignment > TFI.getStackAlign()) { 1262 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize); 1263 B.buildMaskLowPtrBits(Dst, PtrAdd, 1264 Log2(Alignment) + ST.getWavefrontSizeLog2()); 1265 } else { 1266 B.buildPtrAdd(Dst, SPCopy, ScaledSize); 1267 } 1268 1269 MI.eraseFromParent(); 1270 return true; 1271 } 1272 1273 bool AMDGPURegisterBankInfo::applyMappingImage( 1274 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1275 MachineRegisterInfo &MRI, int RsrcIdx) const { 1276 const int NumDefs = MI.getNumExplicitDefs(); 1277 1278 // The reported argument index is relative to the IR intrinsic call arguments, 1279 // so we need to shift by the number of defs and the intrinsic ID. 1280 RsrcIdx += NumDefs + 1; 1281 1282 // Insert copies to VGPR arguments. 1283 applyDefaultMapping(OpdMapper); 1284 1285 // Fixup any SGPR arguments. 1286 SmallVector<unsigned, 4> SGPRIndexes; 1287 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { 1288 if (!MI.getOperand(I).isReg()) 1289 continue; 1290 1291 // If this intrinsic has a sampler, it immediately follows rsrc. 
1292 if (I == RsrcIdx || I == RsrcIdx + 1) 1293 SGPRIndexes.push_back(I); 1294 } 1295 1296 executeInWaterfallLoop(MI, MRI, SGPRIndexes); 1297 return true; 1298 } 1299 1300 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI, 1301 Register Reg) { 1302 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); 1303 if (!Def) 1304 return Reg; 1305 1306 // TODO: Guard against this being an implicit def 1307 return Def->getOperand(0).getReg(); 1308 } 1309 1310 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store 1311 // the three offsets (voffset, soffset and instoffset) 1312 static unsigned setBufferOffsets(MachineIRBuilder &B, 1313 const AMDGPURegisterBankInfo &RBI, 1314 Register CombinedOffset, Register &VOffsetReg, 1315 Register &SOffsetReg, int64_t &InstOffsetVal, 1316 Align Alignment) { 1317 const LLT S32 = LLT::scalar(32); 1318 MachineRegisterInfo *MRI = B.getMRI(); 1319 1320 if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) { 1321 uint32_t SOffset, ImmOffset; 1322 if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget, 1323 Alignment)) { 1324 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1325 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1326 InstOffsetVal = ImmOffset; 1327 1328 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1329 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1330 return SOffset + ImmOffset; 1331 } 1332 } 1333 1334 Register Base; 1335 unsigned Offset; 1336 1337 std::tie(Base, Offset) = 1338 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset); 1339 1340 uint32_t SOffset, ImmOffset; 1341 if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, 1342 &RBI.Subtarget, Alignment)) { 1343 if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { 1344 VOffsetReg = Base; 1345 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1346 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1347 InstOffsetVal = 
ImmOffset; 1348 return 0; // XXX - Why is this 0? 1349 } 1350 1351 // If we have SGPR base, we can use it for soffset. 1352 if (SOffset == 0) { 1353 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1354 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1355 SOffsetReg = Base; 1356 InstOffsetVal = ImmOffset; 1357 return 0; // XXX - Why is this 0? 1358 } 1359 } 1360 1361 // Handle the variable sgpr + vgpr case. 1362 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI); 1363 if (Add && (int)Offset >= 0) { 1364 Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg()); 1365 Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg()); 1366 1367 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI); 1368 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI); 1369 1370 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { 1371 VOffsetReg = Src0; 1372 SOffsetReg = Src1; 1373 return 0; 1374 } 1375 1376 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) { 1377 VOffsetReg = Src1; 1378 SOffsetReg = Src0; 1379 return 0; 1380 } 1381 } 1382 1383 // Ensure we have a VGPR for the combined offset. This could be an issue if we 1384 // have an SGPR offset and a VGPR resource. 
1385 if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { 1386 VOffsetReg = CombinedOffset; 1387 } else { 1388 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0); 1389 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1390 } 1391 1392 SOffsetReg = B.buildConstant(S32, 0).getReg(0); 1393 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1394 return 0; 1395 } 1396 1397 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( 1398 const OperandsMapper &OpdMapper) const { 1399 MachineInstr &MI = OpdMapper.getMI(); 1400 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1401 1402 const LLT S32 = LLT::scalar(32); 1403 Register Dst = MI.getOperand(0).getReg(); 1404 LLT Ty = MRI.getType(Dst); 1405 1406 const RegisterBank *RSrcBank = 1407 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1408 const RegisterBank *OffsetBank = 1409 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1410 if (RSrcBank == &AMDGPU::SGPRRegBank && 1411 OffsetBank == &AMDGPU::SGPRRegBank) 1412 return true; // Legal mapping 1413 1414 // FIXME: 96-bit case was widened during legalize. We need to narrow it back 1415 // here but don't have an MMO. 1416 1417 unsigned LoadSize = Ty.getSizeInBits(); 1418 int NumLoads = 1; 1419 if (LoadSize == 256 || LoadSize == 512) { 1420 NumLoads = LoadSize / 128; 1421 Ty = Ty.divide(NumLoads); 1422 } 1423 1424 // Use the alignment to ensure that the required offsets will fit into the 1425 // immediate offsets. 1426 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1); 1427 1428 MachineIRBuilder B(MI); 1429 MachineFunction &MF = B.getMF(); 1430 1431 Register SOffset; 1432 Register VOffset; 1433 int64_t ImmOffset = 0; 1434 1435 unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(), 1436 VOffset, SOffset, ImmOffset, Alignment); 1437 1438 // TODO: 96-bit loads were widened to 128-bit results. 
Shrink the result if we 1439 // can, but we need to track an MMO for that. 1440 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; 1441 const Align MemAlign(4); // FIXME: ABI type alignment? 1442 MachineMemOperand *BaseMMO = MF.getMachineMemOperand( 1443 MachinePointerInfo(), 1444 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1445 MachineMemOperand::MOInvariant, 1446 MemSize, MemAlign); 1447 if (MMOOffset != 0) 1448 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize); 1449 1450 // If only the offset is divergent, emit a MUBUF buffer load instead. We can 1451 // assume that the buffer is unswizzled. 1452 1453 Register RSrc = MI.getOperand(1).getReg(); 1454 Register VIndex = B.buildConstant(S32, 0).getReg(0); 1455 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank); 1456 1457 SmallVector<Register, 4> LoadParts(NumLoads); 1458 1459 MachineBasicBlock::iterator MII = MI.getIterator(); 1460 MachineInstrSpan Span(MII, &B.getMBB()); 1461 1462 for (int i = 0; i < NumLoads; ++i) { 1463 if (NumLoads == 1) { 1464 LoadParts[i] = Dst; 1465 } else { 1466 LoadParts[i] = MRI.createGenericVirtualRegister(Ty); 1467 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank); 1468 } 1469 1470 MachineMemOperand *MMO = BaseMMO; 1471 if (i != 0) 1472 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize); 1473 1474 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD) 1475 .addDef(LoadParts[i]) // vdata 1476 .addUse(RSrc) // rsrc 1477 .addUse(VIndex) // vindex 1478 .addUse(VOffset) // voffset 1479 .addUse(SOffset) // soffset 1480 .addImm(ImmOffset + 16 * i) // offset(imm) 1481 .addImm(0) // cachepolicy, swizzled buffer(imm) 1482 .addImm(0) // idxen(imm) 1483 .addMemOperand(MMO); 1484 } 1485 1486 // TODO: If only the resource is a VGPR, it may be better to execute the 1487 // scalar load in the waterfall loop if the resource is expected to frequently 1488 // be dynamically uniform. 
1489 if (RSrcBank != &AMDGPU::SGPRRegBank) { 1490 // Remove the original instruction to avoid potentially confusing the 1491 // waterfall loop logic. 1492 B.setInstr(*Span.begin()); 1493 MI.eraseFromParent(); 1494 1495 SmallSet<Register, 4> OpsToWaterfall; 1496 1497 OpsToWaterfall.insert(RSrc); 1498 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 1499 OpsToWaterfall, MRI); 1500 } 1501 1502 if (NumLoads != 1) { 1503 if (Ty.isVector()) 1504 B.buildConcatVectors(Dst, LoadParts); 1505 else 1506 B.buildMerge(Dst, LoadParts); 1507 } 1508 1509 // We removed the instruction earlier with a waterfall loop. 1510 if (RSrcBank == &AMDGPU::SGPRRegBank) 1511 MI.eraseFromParent(); 1512 1513 return true; 1514 } 1515 1516 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper, 1517 bool Signed) const { 1518 MachineInstr &MI = OpdMapper.getMI(); 1519 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1520 1521 // Insert basic copies 1522 applyDefaultMapping(OpdMapper); 1523 1524 Register DstReg = MI.getOperand(0).getReg(); 1525 LLT Ty = MRI.getType(DstReg); 1526 1527 const LLT S32 = LLT::scalar(32); 1528 1529 unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1; 1530 Register SrcReg = MI.getOperand(FirstOpnd).getReg(); 1531 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg(); 1532 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg(); 1533 1534 const RegisterBank *DstBank = 1535 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1536 if (DstBank == &AMDGPU::VGPRRegBank) { 1537 if (Ty == S32) 1538 return true; 1539 1540 // There is no 64-bit vgpr bitfield extract instructions so the operation 1541 // is expanded to a sequence of instructions that implement the operation. 1542 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank); 1543 MachineIRBuilder B(MI, ApplyBank); 1544 1545 const LLT S64 = LLT::scalar(64); 1546 // Shift the source operand so that extracted bits start at bit 0. 
1547 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg) 1548 : B.buildLShr(S64, SrcReg, OffsetReg); 1549 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset); 1550 1551 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions 1552 // if the width is a constant. 1553 if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) { 1554 // Use the 32-bit bitfield extract instruction if the width is a constant. 1555 // Depending on the width size, use either the low or high 32-bits. 1556 auto Zero = B.buildConstant(S32, 0); 1557 auto WidthImm = ConstWidth->Value.getZExtValue(); 1558 if (WidthImm <= 32) { 1559 // Use bitfield extract on the lower 32-bit source, and then sign-extend 1560 // or clear the upper 32-bits. 1561 auto Extract = 1562 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg) 1563 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg); 1564 auto Extend = 1565 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero; 1566 B.buildMerge(DstReg, {Extract, Extend}); 1567 } else { 1568 // Use bitfield extract on upper 32-bit source, and combine with lower 1569 // 32-bit source. 1570 auto UpperWidth = B.buildConstant(S32, WidthImm - 32); 1571 auto Extract = 1572 Signed 1573 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth) 1574 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth); 1575 B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract}); 1576 } 1577 MI.eraseFromParent(); 1578 return true; 1579 } 1580 1581 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit 1582 // operations. 
1583 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg); 1584 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift); 1585 if (Signed) 1586 B.buildAShr(S64, SignBit, ExtShift); 1587 else 1588 B.buildLShr(S64, SignBit, ExtShift); 1589 MI.eraseFromParent(); 1590 return true; 1591 } 1592 1593 // The scalar form packs the offset and width in a single operand. 1594 1595 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); 1596 MachineIRBuilder B(MI, ApplyBank); 1597 1598 // Ensure the high bits are clear to insert the offset. 1599 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6)); 1600 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); 1601 1602 // Zeros out the low bits, so don't bother clamping the input value. 1603 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16)); 1604 1605 // Transformation function, pack the offset and width of a BFE into 1606 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second 1607 // source, bits [5:0] contain the offset and bits [22:16] the width. 1608 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); 1609 1610 // TODO: It might be worth using a pseudo here to avoid scc clobber and 1611 // register class constraints. 1612 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : 1613 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1614 1615 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1616 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1617 llvm_unreachable("failed to constrain BFE"); 1618 1619 MI.eraseFromParent(); 1620 return true; 1621 } 1622 1623 // Return a suitable opcode for extending the operands of Opc when widening. 
1624 static unsigned getExtendOp(unsigned Opc) { 1625 switch (Opc) { 1626 case TargetOpcode::G_ASHR: 1627 case TargetOpcode::G_SMIN: 1628 case TargetOpcode::G_SMAX: 1629 return TargetOpcode::G_SEXT; 1630 case TargetOpcode::G_LSHR: 1631 case TargetOpcode::G_UMIN: 1632 case TargetOpcode::G_UMAX: 1633 return TargetOpcode::G_ZEXT; 1634 default: 1635 return TargetOpcode::G_ANYEXT; 1636 } 1637 } 1638 1639 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1640 // any illegal vector extend or unmerge operations. 1641 static std::pair<Register, Register> 1642 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1643 const LLT S32 = LLT::scalar(32); 1644 auto Bitcast = B.buildBitcast(S32, Src); 1645 1646 if (ExtOpcode == TargetOpcode::G_SEXT) { 1647 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1648 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1649 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1650 } 1651 1652 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1653 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1654 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1655 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1656 } 1657 1658 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1659 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1660 } 1661 1662 // For cases where only a single copy is inserted for matching register banks. 1663 // Replace the register in the instruction operand 1664 static bool substituteSimpleCopyRegs( 1665 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1666 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1667 if (!SrcReg.empty()) { 1668 assert(SrcReg.size() == 1); 1669 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1670 return true; 1671 } 1672 1673 return false; 1674 } 1675 1676 /// Handle register layout difference for f16 images for some subtargets. 
1677 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, 1678 MachineRegisterInfo &MRI, 1679 Register Reg) const { 1680 if (!Subtarget.hasUnpackedD16VMem()) 1681 return Reg; 1682 1683 const LLT S16 = LLT::scalar(16); 1684 LLT StoreVT = MRI.getType(Reg); 1685 if (!StoreVT.isVector() || StoreVT.getElementType() != S16) 1686 return Reg; 1687 1688 auto Unmerge = B.buildUnmerge(S16, Reg); 1689 1690 1691 SmallVector<Register, 4> WideRegs; 1692 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1693 WideRegs.push_back(Unmerge.getReg(I)); 1694 1695 const LLT S32 = LLT::scalar(32); 1696 int NumElts = StoreVT.getNumElements(); 1697 1698 return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0); 1699 } 1700 1701 static std::pair<Register, unsigned> 1702 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 1703 int64_t Const; 1704 if (mi_match(Reg, MRI, m_ICst(Const))) 1705 return std::make_pair(Register(), Const); 1706 1707 Register Base; 1708 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) 1709 return std::make_pair(Base, Const); 1710 1711 // TODO: Handle G_OR used for add case 1712 return std::make_pair(Reg, 0); 1713 } 1714 1715 std::pair<Register, unsigned> 1716 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, 1717 Register OrigOffset) const { 1718 const unsigned MaxImm = 4095; 1719 Register BaseReg; 1720 unsigned ImmOffset; 1721 const LLT S32 = LLT::scalar(32); 1722 1723 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), 1724 OrigOffset); 1725 1726 unsigned C1 = 0; 1727 if (ImmOffset != 0) { 1728 // If the immediate value is too big for the immoffset field, put the value 1729 // and -4096 into the immoffset field so that the value that is copied/added 1730 // for the voffset field is a multiple of 4096, and it stands more chance 1731 // of being CSEd with the copy/add for another similar load/store. 
1732 // However, do not do that rounding down to a multiple of 4096 if that is a 1733 // negative number, as it appears to be illegal to have a negative offset 1734 // in the vgpr, even if adding the immediate offset makes it positive. 1735 unsigned Overflow = ImmOffset & ~MaxImm; 1736 ImmOffset -= Overflow; 1737 if ((int32_t)Overflow < 0) { 1738 Overflow += ImmOffset; 1739 ImmOffset = 0; 1740 } 1741 1742 C1 = ImmOffset; 1743 if (Overflow != 0) { 1744 if (!BaseReg) 1745 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1746 else { 1747 auto OverflowVal = B.buildConstant(S32, Overflow); 1748 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1749 } 1750 } 1751 } 1752 1753 if (!BaseReg) 1754 BaseReg = B.buildConstant(S32, 0).getReg(0); 1755 1756 return {BaseReg, C1}; 1757 } 1758 1759 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, 1760 Register SrcReg) const { 1761 MachineRegisterInfo &MRI = *B.getMRI(); 1762 LLT SrcTy = MRI.getType(SrcReg); 1763 if (SrcTy.getSizeInBits() == 32) { 1764 // Use a v_mov_b32 here to make the exec dependency explicit. 
1765 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1766 .addDef(DstReg) 1767 .addUse(SrcReg); 1768 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) && 1769 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI); 1770 } 1771 1772 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1773 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1774 1775 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1776 .addDef(TmpReg0) 1777 .addUse(SrcReg, 0, AMDGPU::sub0); 1778 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1779 .addDef(TmpReg1) 1780 .addUse(SrcReg, 0, AMDGPU::sub1); 1781 B.buildInstr(AMDGPU::REG_SEQUENCE) 1782 .addDef(DstReg) 1783 .addUse(TmpReg0) 1784 .addImm(AMDGPU::sub0) 1785 .addUse(TmpReg1) 1786 .addImm(AMDGPU::sub1); 1787 1788 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) && 1789 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); 1790 } 1791 1792 /// Utility function for pushing dynamic vector indexes with a constant offset 1793 /// into waterfall loops. 1794 static void reinsertVectorIndexAdd(MachineIRBuilder &B, 1795 MachineInstr &IdxUseInstr, 1796 unsigned OpIdx, 1797 unsigned ConstOffset) { 1798 MachineRegisterInfo &MRI = *B.getMRI(); 1799 const LLT S32 = LLT::scalar(32); 1800 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); 1801 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); 1802 1803 auto MaterializedOffset = B.buildConstant(S32, ConstOffset); 1804 1805 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); 1806 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); 1807 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); 1808 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); 1809 } 1810 1811 /// Implement extending a 32-bit value to a 64-bit value. 
\p Lo32Reg is the 1812 /// original 32-bit source value (to be inserted in the low part of the combined 1813 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit 1814 /// value. 1815 static void extendLow32IntoHigh32(MachineIRBuilder &B, 1816 Register Hi32Reg, Register Lo32Reg, 1817 unsigned ExtOpc, 1818 const RegisterBank &RegBank, 1819 bool IsBooleanSrc = false) { 1820 if (ExtOpc == AMDGPU::G_ZEXT) { 1821 B.buildConstant(Hi32Reg, 0); 1822 } else if (ExtOpc == AMDGPU::G_SEXT) { 1823 if (IsBooleanSrc) { 1824 // If we know the original source was an s1, the high half is the same as 1825 // the low. 1826 B.buildCopy(Hi32Reg, Lo32Reg); 1827 } else { 1828 // Replicate sign bit from 32-bit extended part. 1829 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1830 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1831 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1832 } 1833 } else { 1834 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1835 B.buildUndef(Hi32Reg); 1836 } 1837 } 1838 1839 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1840 MachineInstr &MI, MachineRegisterInfo &MRI, 1841 const OperandsMapper &OpdMapper) const { 1842 1843 Register VecReg = MI.getOperand(1).getReg(); 1844 Register Idx = MI.getOperand(2).getReg(); 1845 1846 const RegisterBank &IdxBank = 1847 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1848 1849 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1850 1851 LLT VecTy = MRI.getType(VecReg); 1852 unsigned EltSize = VecTy.getScalarSizeInBits(); 1853 unsigned NumElem = VecTy.getNumElements(); 1854 1855 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1856 IsDivergentIdx)) 1857 return false; 1858 1859 MachineIRBuilder B(MI); 1860 LLT S32 = LLT::scalar(32); 1861 1862 const RegisterBank &DstBank = 1863 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1864 const RegisterBank &SrcBank = 1865 
*OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1866 1867 const RegisterBank &CCBank = 1868 (DstBank == AMDGPU::SGPRRegBank && 1869 SrcBank == AMDGPU::SGPRRegBank && 1870 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1871 : AMDGPU::VCCRegBank; 1872 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1873 1874 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1875 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1876 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1877 } 1878 1879 LLT EltTy = VecTy.getScalarType(); 1880 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1881 unsigned NumLanes = DstRegs.size(); 1882 if (!NumLanes) 1883 NumLanes = 1; 1884 else 1885 EltTy = MRI.getType(DstRegs[0]); 1886 1887 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1888 SmallVector<Register, 2> Res(NumLanes); 1889 for (unsigned L = 0; L < NumLanes; ++L) 1890 Res[L] = UnmergeToEltTy.getReg(L); 1891 1892 for (unsigned I = 1; I < NumElem; ++I) { 1893 auto IC = B.buildConstant(S32, I); 1894 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1895 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1896 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1897 1898 for (unsigned L = 0; L < NumLanes; ++L) { 1899 auto S = B.buildSelect(EltTy, Cmp, 1900 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 1901 1902 for (unsigned N : { 0, 2, 3 }) 1903 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 1904 1905 Res[L] = S->getOperand(0).getReg(); 1906 } 1907 } 1908 1909 for (unsigned L = 0; L < NumLanes; ++L) { 1910 Register DstReg = (NumLanes == 1) ? 
MI.getOperand(0).getReg() : DstRegs[L]; 1911 B.buildCopy(DstReg, Res[L]); 1912 MRI.setRegBank(DstReg, DstBank); 1913 } 1914 1915 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 1916 MI.eraseFromParent(); 1917 1918 return true; 1919 } 1920 1921 // Insert a cross regbank copy for a register if it already has a bank that 1922 // differs from the one we want to set. 1923 static Register constrainRegToBank(MachineRegisterInfo &MRI, 1924 MachineIRBuilder &B, Register &Reg, 1925 const RegisterBank &Bank) { 1926 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); 1927 if (CurrBank && *CurrBank != Bank) { 1928 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); 1929 MRI.setRegBank(Copy, Bank); 1930 return Copy; 1931 } 1932 1933 MRI.setRegBank(Reg, Bank); 1934 return Reg; 1935 } 1936 1937 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 1938 MachineInstr &MI, MachineRegisterInfo &MRI, 1939 const OperandsMapper &OpdMapper) const { 1940 1941 Register VecReg = MI.getOperand(1).getReg(); 1942 Register Idx = MI.getOperand(3).getReg(); 1943 1944 const RegisterBank &IdxBank = 1945 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 1946 1947 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1948 1949 LLT VecTy = MRI.getType(VecReg); 1950 unsigned EltSize = VecTy.getScalarSizeInBits(); 1951 unsigned NumElem = VecTy.getNumElements(); 1952 1953 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1954 IsDivergentIdx)) 1955 return false; 1956 1957 MachineIRBuilder B(MI); 1958 LLT S32 = LLT::scalar(32); 1959 1960 const RegisterBank &DstBank = 1961 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1962 const RegisterBank &SrcBank = 1963 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1964 const RegisterBank &InsBank = 1965 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1966 1967 const RegisterBank &CCBank = 1968 (DstBank == AMDGPU::SGPRRegBank && 1969 
SrcBank == AMDGPU::SGPRRegBank && 1970 InsBank == AMDGPU::SGPRRegBank && 1971 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1972 : AMDGPU::VCCRegBank; 1973 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1974 1975 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1976 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1977 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1978 } 1979 1980 LLT EltTy = VecTy.getScalarType(); 1981 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 1982 unsigned NumLanes = InsRegs.size(); 1983 if (!NumLanes) { 1984 NumLanes = 1; 1985 InsRegs.push_back(MI.getOperand(2).getReg()); 1986 } else { 1987 EltTy = MRI.getType(InsRegs[0]); 1988 } 1989 1990 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1991 SmallVector<Register, 16> Ops(NumElem * NumLanes); 1992 1993 for (unsigned I = 0; I < NumElem; ++I) { 1994 auto IC = B.buildConstant(S32, I); 1995 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1996 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1997 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1998 1999 for (unsigned L = 0; L < NumLanes; ++L) { 2000 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank); 2001 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L); 2002 Op1 = constrainRegToBank(MRI, B, Op1, DstBank); 2003 2004 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0); 2005 MRI.setRegBank(Select, DstBank); 2006 2007 Ops[I * NumLanes + L] = Select; 2008 } 2009 } 2010 2011 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy); 2012 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2013 B.buildBuildVector(MI.getOperand(0), Ops); 2014 } else { 2015 auto Vec = B.buildBuildVector(MergeTy, Ops); 2016 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2017 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2018 } 2019 2020 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2021 MI.eraseFromParent(); 2022 2023 return true; 2024 } 
2025 2026 void AMDGPURegisterBankInfo::applyMappingImpl( 2027 const OperandsMapper &OpdMapper) const { 2028 MachineInstr &MI = OpdMapper.getMI(); 2029 unsigned Opc = MI.getOpcode(); 2030 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2031 switch (Opc) { 2032 case AMDGPU::G_PHI: { 2033 Register DstReg = MI.getOperand(0).getReg(); 2034 LLT DstTy = MRI.getType(DstReg); 2035 if (DstTy != LLT::scalar(1)) 2036 break; 2037 2038 const LLT S32 = LLT::scalar(32); 2039 const RegisterBank *DstBank = 2040 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2041 if (DstBank == &AMDGPU::VCCRegBank) { 2042 applyDefaultMapping(OpdMapper); 2043 // The standard handling only considers the result register bank for 2044 // phis. For VCC, blindly inserting a copy when the phi is lowered will 2045 // produce an invalid copy. We can only copy with some kind of compare to 2046 // get a vector boolean result. Insert a register bank copy that will be 2047 // correctly lowered to a compare. 2048 MachineIRBuilder B(*MI.getParent()->getParent()); 2049 2050 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2051 Register SrcReg = MI.getOperand(I).getReg(); 2052 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 2053 2054 if (SrcBank != &AMDGPU::VCCRegBank) { 2055 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); 2056 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); 2057 2058 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); 2059 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); 2060 MI.getOperand(I).setReg(Copy.getReg(0)); 2061 } 2062 } 2063 2064 return; 2065 } 2066 2067 // Phi handling is strange and only considers the bank of the destination. 
2068 substituteSimpleCopyRegs(OpdMapper, 0); 2069 2070 // Promote SGPR/VGPR booleans to s32 2071 MachineFunction *MF = MI.getParent()->getParent(); 2072 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2073 MachineIRBuilder B(MI, ApplyBank); 2074 LegalizerHelper Helper(*MF, ApplyBank, B); 2075 2076 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2077 llvm_unreachable("widen scalar should have succeeded"); 2078 2079 return; 2080 } 2081 case AMDGPU::G_ICMP: 2082 case AMDGPU::G_UADDO: 2083 case AMDGPU::G_USUBO: 2084 case AMDGPU::G_UADDE: 2085 case AMDGPU::G_SADDE: 2086 case AMDGPU::G_USUBE: 2087 case AMDGPU::G_SSUBE: { 2088 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1; 2089 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 2090 2091 const RegisterBank *DstBank = 2092 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2093 if (DstBank != &AMDGPU::SGPRRegBank) 2094 break; 2095 2096 const bool HasCarryIn = MI.getNumOperands() == 5; 2097 2098 // If this is a scalar compare, promote the result to s32, as the selection 2099 // will end up using a copy to a 32-bit vreg. 2100 const LLT S32 = LLT::scalar(32); 2101 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 2102 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 2103 MI.getOperand(BoolDstOp).setReg(NewDstReg); 2104 MachineIRBuilder B(MI); 2105 2106 if (HasCarryIn) { 2107 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 2108 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 2109 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 2110 MI.getOperand(4).setReg(NewSrcReg); 2111 } 2112 2113 MachineBasicBlock *MBB = MI.getParent(); 2114 B.setInsertPt(*MBB, std::next(MI.getIterator())); 2115 2116 // If we had a constrained VCC result register, a copy was inserted to VCC 2117 // from SGPR. 
2118 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2119 if (DefRegs.empty()) 2120 DefRegs.push_back(DstReg); 2121 B.buildTrunc(DefRegs[0], NewDstReg); 2122 return; 2123 } 2124 case AMDGPU::G_SELECT: { 2125 Register DstReg = MI.getOperand(0).getReg(); 2126 LLT DstTy = MRI.getType(DstReg); 2127 2128 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2129 if (CondRegs.empty()) 2130 CondRegs.push_back(MI.getOperand(1).getReg()); 2131 else { 2132 assert(CondRegs.size() == 1); 2133 } 2134 2135 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2136 if (CondBank == &AMDGPU::SGPRRegBank) { 2137 MachineIRBuilder B(MI); 2138 const LLT S32 = LLT::scalar(32); 2139 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2140 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2141 2142 MI.getOperand(1).setReg(NewCondReg); 2143 B.buildZExt(NewCondReg, CondRegs[0]); 2144 } 2145 2146 if (DstTy.getSizeInBits() != 64) 2147 break; 2148 2149 MachineIRBuilder B(MI); 2150 LLT HalfTy = getHalfSizedType(DstTy); 2151 2152 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2153 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2154 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2155 2156 // All inputs are SGPRs, nothing special to do. 
2157 if (DefRegs.empty()) { 2158 assert(Src1Regs.empty() && Src2Regs.empty()); 2159 break; 2160 } 2161 2162 if (Src1Regs.empty()) 2163 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2164 else { 2165 setRegsToType(MRI, Src1Regs, HalfTy); 2166 } 2167 2168 if (Src2Regs.empty()) 2169 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2170 else 2171 setRegsToType(MRI, Src2Regs, HalfTy); 2172 2173 setRegsToType(MRI, DefRegs, HalfTy); 2174 2175 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2176 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2177 2178 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2179 MI.eraseFromParent(); 2180 return; 2181 } 2182 case AMDGPU::G_BRCOND: { 2183 Register CondReg = MI.getOperand(0).getReg(); 2184 // FIXME: Should use legalizer helper, but should change bool ext type. 2185 const RegisterBank *CondBank = 2186 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2187 2188 if (CondBank == &AMDGPU::SGPRRegBank) { 2189 MachineIRBuilder B(MI); 2190 const LLT S32 = LLT::scalar(32); 2191 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2192 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2193 2194 MI.getOperand(0).setReg(NewCondReg); 2195 B.buildZExt(NewCondReg, CondReg); 2196 return; 2197 } 2198 2199 break; 2200 } 2201 case AMDGPU::G_AND: 2202 case AMDGPU::G_OR: 2203 case AMDGPU::G_XOR: { 2204 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2205 // there is a VGPR input. 
2206 Register DstReg = MI.getOperand(0).getReg(); 2207 LLT DstTy = MRI.getType(DstReg); 2208 2209 if (DstTy.getSizeInBits() == 1) { 2210 const RegisterBank *DstBank = 2211 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2212 if (DstBank == &AMDGPU::VCCRegBank) 2213 break; 2214 2215 MachineFunction *MF = MI.getParent()->getParent(); 2216 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2217 MachineIRBuilder B(MI, ApplyBank); 2218 LegalizerHelper Helper(*MF, ApplyBank, B); 2219 2220 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2221 LegalizerHelper::Legalized) 2222 llvm_unreachable("widen scalar should have succeeded"); 2223 return; 2224 } 2225 2226 if (DstTy.getSizeInBits() != 64) 2227 break; 2228 2229 LLT HalfTy = getHalfSizedType(DstTy); 2230 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2231 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2232 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2233 2234 // All inputs are SGPRs, nothing special to do. 2235 if (DefRegs.empty()) { 2236 assert(Src0Regs.empty() && Src1Regs.empty()); 2237 break; 2238 } 2239 2240 assert(DefRegs.size() == 2); 2241 assert(Src0Regs.size() == Src1Regs.size() && 2242 (Src0Regs.empty() || Src0Regs.size() == 2)); 2243 2244 // Depending on where the source registers came from, the generic code may 2245 // have decided to split the inputs already or not. If not, we still need to 2246 // extract the values. 
2247 MachineIRBuilder B(MI); 2248 2249 if (Src0Regs.empty()) 2250 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2251 else 2252 setRegsToType(MRI, Src0Regs, HalfTy); 2253 2254 if (Src1Regs.empty()) 2255 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2256 else 2257 setRegsToType(MRI, Src1Regs, HalfTy); 2258 2259 setRegsToType(MRI, DefRegs, HalfTy); 2260 2261 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); 2262 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); 2263 2264 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2265 MI.eraseFromParent(); 2266 return; 2267 } 2268 case AMDGPU::G_ABS: { 2269 Register SrcReg = MI.getOperand(1).getReg(); 2270 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); 2271 2272 // There is no VALU abs instruction so we need to replace it with a sub and 2273 // max combination. 2274 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { 2275 MachineFunction *MF = MI.getParent()->getParent(); 2276 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank); 2277 MachineIRBuilder B(MI, Apply); 2278 LegalizerHelper Helper(*MF, Apply, B); 2279 2280 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) 2281 llvm_unreachable("lowerAbsToMaxNeg should have succeeded"); 2282 return; 2283 } 2284 LLVM_FALLTHROUGH; 2285 } 2286 case AMDGPU::G_ADD: 2287 case AMDGPU::G_SUB: 2288 case AMDGPU::G_MUL: 2289 case AMDGPU::G_SHL: 2290 case AMDGPU::G_LSHR: 2291 case AMDGPU::G_ASHR: 2292 case AMDGPU::G_SMIN: 2293 case AMDGPU::G_SMAX: 2294 case AMDGPU::G_UMIN: 2295 case AMDGPU::G_UMAX: { 2296 Register DstReg = MI.getOperand(0).getReg(); 2297 LLT DstTy = MRI.getType(DstReg); 2298 2299 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2300 // Packed 16-bit operations need to be scalarized and promoted. 
2301 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) 2302 break; 2303 2304 const RegisterBank *DstBank = 2305 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2306 if (DstBank == &AMDGPU::VGPRRegBank) 2307 break; 2308 2309 const LLT S32 = LLT::scalar(32); 2310 MachineBasicBlock *MBB = MI.getParent(); 2311 MachineFunction *MF = MBB->getParent(); 2312 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2313 MachineIRBuilder B(MI, ApplySALU); 2314 2315 if (DstTy.isVector()) { 2316 Register WideSrc0Lo, WideSrc0Hi; 2317 Register WideSrc1Lo, WideSrc1Hi; 2318 2319 unsigned ExtendOp = getExtendOp(MI.getOpcode()); 2320 std::tie(WideSrc0Lo, WideSrc0Hi) 2321 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); 2322 std::tie(WideSrc1Lo, WideSrc1Hi) 2323 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); 2324 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2325 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2326 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2327 MI.eraseFromParent(); 2328 } else { 2329 LegalizerHelper Helper(*MF, ApplySALU, B); 2330 2331 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2332 llvm_unreachable("widen scalar should have succeeded"); 2333 2334 // FIXME: s16 shift amounts should be legal. 
2335 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || 2336 Opc == AMDGPU::G_ASHR) { 2337 B.setInsertPt(*MBB, MI.getIterator()); 2338 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2339 llvm_unreachable("widen scalar should have succeeded"); 2340 } 2341 } 2342 2343 return; 2344 } 2345 case AMDGPU::G_SEXT_INREG: { 2346 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2347 if (SrcRegs.empty()) 2348 break; // Nothing to repair 2349 2350 const LLT S32 = LLT::scalar(32); 2351 MachineIRBuilder B(MI); 2352 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); 2353 GISelObserverWrapper Observer(&O); 2354 B.setChangeObserver(Observer); 2355 2356 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs 2357 // we would need to further expand, and doesn't let us directly set the 2358 // result registers. 2359 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2360 2361 int Amt = MI.getOperand(2).getImm(); 2362 if (Amt <= 32) { 2363 if (Amt == 32) { 2364 // The low bits are unchanged. 2365 B.buildCopy(DstRegs[0], SrcRegs[0]); 2366 } else { 2367 // Extend in the low bits and propagate the sign bit to the high half. 2368 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt); 2369 } 2370 2371 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); 2372 } else { 2373 // The low bits are unchanged, and extend in the high bits. 
2374 B.buildCopy(DstRegs[0], SrcRegs[0]); 2375 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); 2376 } 2377 2378 Register DstReg = MI.getOperand(0).getReg(); 2379 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2380 MI.eraseFromParent(); 2381 return; 2382 } 2383 case AMDGPU::G_CTPOP: 2384 case AMDGPU::G_BITREVERSE: { 2385 const RegisterBank *DstBank = 2386 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2387 if (DstBank == &AMDGPU::SGPRRegBank) 2388 break; 2389 2390 Register SrcReg = MI.getOperand(1).getReg(); 2391 const LLT S32 = LLT::scalar(32); 2392 LLT Ty = MRI.getType(SrcReg); 2393 if (Ty == S32) 2394 break; 2395 2396 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); 2397 MachineIRBuilder B(MI, ApplyVALU); 2398 2399 MachineFunction &MF = B.getMF(); 2400 LegalizerHelper Helper(MF, ApplyVALU, B); 2401 2402 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2403 llvm_unreachable("narrowScalar should have succeeded"); 2404 return; 2405 } 2406 case AMDGPU::G_AMDGPU_FFBH_U32: 2407 case AMDGPU::G_AMDGPU_FFBL_B32: 2408 case AMDGPU::G_CTLZ_ZERO_UNDEF: 2409 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 2410 const RegisterBank *DstBank = 2411 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2412 if (DstBank == &AMDGPU::SGPRRegBank) 2413 break; 2414 2415 Register SrcReg = MI.getOperand(1).getReg(); 2416 const LLT S32 = LLT::scalar(32); 2417 LLT Ty = MRI.getType(SrcReg); 2418 if (Ty == S32) 2419 break; 2420 2421 // We can narrow this more efficiently than Helper can by using ffbh/ffbl 2422 // which return -1 when the input is zero: 2423 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) 2424 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) 2425 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) 2426 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo)) 2427 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); 2428 MachineIRBuilder B(MI, 
ApplyVALU); 2429 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2430 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF 2431 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 2432 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2433 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 2434 : Opc; 2435 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; 2436 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]}); 2437 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]}); 2438 unsigned AddOpc = 2439 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2440 ? AMDGPU::G_ADD 2441 : AMDGPU::G_UADDSAT; 2442 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)}); 2443 Register DstReg = MI.getOperand(0).getReg(); 2444 B.buildUMin(DstReg, X, Y); 2445 MI.eraseFromParent(); 2446 return; 2447 } 2448 case AMDGPU::G_SEXT: 2449 case AMDGPU::G_ZEXT: 2450 case AMDGPU::G_ANYEXT: { 2451 Register SrcReg = MI.getOperand(1).getReg(); 2452 LLT SrcTy = MRI.getType(SrcReg); 2453 const bool Signed = Opc == AMDGPU::G_SEXT; 2454 2455 assert(empty(OpdMapper.getVRegs(1))); 2456 2457 MachineIRBuilder B(MI); 2458 const RegisterBank *SrcBank = 2459 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2460 2461 Register DstReg = MI.getOperand(0).getReg(); 2462 LLT DstTy = MRI.getType(DstReg); 2463 if (DstTy.isScalar() && 2464 SrcBank != &AMDGPU::SGPRRegBank && 2465 SrcBank != &AMDGPU::VCCRegBank && 2466 // FIXME: Should handle any type that round to s64 when irregular 2467 // breakdowns supported. 2468 DstTy.getSizeInBits() == 64 && 2469 SrcTy.getSizeInBits() <= 32) { 2470 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2471 2472 // Extend to 32-bit, and then extend the low half. 
2473 if (Signed) { 2474 // TODO: Should really be buildSExtOrCopy 2475 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 2476 } else if (Opc == AMDGPU::G_ZEXT) { 2477 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 2478 } else { 2479 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); 2480 } 2481 2482 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); 2483 MRI.setRegBank(DstReg, *SrcBank); 2484 MI.eraseFromParent(); 2485 return; 2486 } 2487 2488 if (SrcTy != LLT::scalar(1)) 2489 return; 2490 2491 // It is not legal to have a legalization artifact with a VCC source. Rather 2492 // than introducing a copy, insert the select we would have to select the 2493 // copy to. 2494 if (SrcBank == &AMDGPU::VCCRegBank) { 2495 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2496 2497 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; 2498 2499 unsigned DstSize = DstTy.getSizeInBits(); 2500 // 64-bit select is SGPR only 2501 const bool UseSel64 = DstSize > 32 && 2502 SrcBank->getID() == AMDGPU::SGPRRegBankID; 2503 2504 // TODO: Should s16 select be legal? 2505 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 2506 auto True = B.buildConstant(SelType, Signed ? 
-1 : 1); 2507 auto False = B.buildConstant(SelType, 0); 2508 2509 MRI.setRegBank(True.getReg(0), *DstBank); 2510 MRI.setRegBank(False.getReg(0), *DstBank); 2511 MRI.setRegBank(DstReg, *DstBank); 2512 2513 if (DstSize > 32) { 2514 B.buildSelect(DefRegs[0], SrcReg, True, False); 2515 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2516 } else if (DstSize < 32) { 2517 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2518 MRI.setRegBank(Sel.getReg(0), *DstBank); 2519 B.buildTrunc(DstReg, Sel); 2520 } else { 2521 B.buildSelect(DstReg, SrcReg, True, False); 2522 } 2523 2524 MI.eraseFromParent(); 2525 return; 2526 } 2527 2528 break; 2529 } 2530 case AMDGPU::G_BUILD_VECTOR: 2531 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 2532 Register DstReg = MI.getOperand(0).getReg(); 2533 LLT DstTy = MRI.getType(DstReg); 2534 if (DstTy != LLT::fixed_vector(2, 16)) 2535 break; 2536 2537 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); 2538 substituteSimpleCopyRegs(OpdMapper, 1); 2539 substituteSimpleCopyRegs(OpdMapper, 2); 2540 2541 const RegisterBank *DstBank = 2542 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2543 if (DstBank == &AMDGPU::SGPRRegBank) 2544 break; // Can use S_PACK_* instructions. 
2545 2546 MachineIRBuilder B(MI); 2547 2548 Register Lo = MI.getOperand(1).getReg(); 2549 Register Hi = MI.getOperand(2).getReg(); 2550 const LLT S32 = LLT::scalar(32); 2551 2552 const RegisterBank *BankLo = 2553 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2554 const RegisterBank *BankHi = 2555 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2556 2557 Register ZextLo; 2558 Register ShiftHi; 2559 2560 if (Opc == AMDGPU::G_BUILD_VECTOR) { 2561 ZextLo = B.buildZExt(S32, Lo).getReg(0); 2562 MRI.setRegBank(ZextLo, *BankLo); 2563 2564 Register ZextHi = B.buildZExt(S32, Hi).getReg(0); 2565 MRI.setRegBank(ZextHi, *BankHi); 2566 2567 auto ShiftAmt = B.buildConstant(S32, 16); 2568 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2569 2570 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); 2571 MRI.setRegBank(ShiftHi, *BankHi); 2572 } else { 2573 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); 2574 MRI.setRegBank(MaskLo, *BankLo); 2575 2576 auto ShiftAmt = B.buildConstant(S32, 16); 2577 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2578 2579 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); 2580 MRI.setRegBank(ShiftHi, *BankHi); 2581 2582 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); 2583 MRI.setRegBank(ZextLo, *BankLo); 2584 } 2585 2586 auto Or = B.buildOr(S32, ZextLo, ShiftHi); 2587 MRI.setRegBank(Or.getReg(0), *DstBank); 2588 2589 B.buildBitcast(DstReg, Or); 2590 MI.eraseFromParent(); 2591 return; 2592 } 2593 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2594 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2595 2596 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2597 2598 Register DstReg = MI.getOperand(0).getReg(); 2599 Register SrcReg = MI.getOperand(1).getReg(); 2600 2601 const LLT S32 = LLT::scalar(32); 2602 LLT DstTy = MRI.getType(DstReg); 2603 LLT SrcTy = MRI.getType(SrcReg); 2604 2605 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) 2606 return; 2607 2608 MachineIRBuilder B(MI); 
2609 2610 const ValueMapping &DstMapping 2611 = OpdMapper.getInstrMapping().getOperandMapping(0); 2612 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2613 const RegisterBank *SrcBank = 2614 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2615 const RegisterBank *IdxBank = 2616 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2617 2618 Register BaseIdxReg; 2619 unsigned ConstOffset; 2620 std::tie(BaseIdxReg, ConstOffset) = 2621 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2622 2623 // See if the index is an add of a constant which will be foldable by moving 2624 // the base register of the index later if this is going to be executed in a 2625 // waterfall loop. This is essentially to reassociate the add of a constant 2626 // with the readfirstlane. 2627 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2628 ConstOffset > 0 && 2629 ConstOffset < SrcTy.getNumElements(); 2630 2631 // Move the base register. We'll re-insert the add later. 2632 if (ShouldMoveIndexIntoLoop) 2633 MI.getOperand(2).setReg(BaseIdxReg); 2634 2635 // If this is a VGPR result only because the index was a VGPR result, the 2636 // actual indexing will be done on the SGPR source vector, which will 2637 // produce a scalar result. We need to copy to the VGPR result inside the 2638 // waterfall loop. 2639 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2640 SrcBank == &AMDGPU::SGPRRegBank; 2641 if (DstRegs.empty()) { 2642 applyDefaultMapping(OpdMapper); 2643 2644 executeInWaterfallLoop(MI, MRI, { 2 }); 2645 2646 if (NeedCopyToVGPR) { 2647 // We don't want a phi for this temporary reg. 2648 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2649 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2650 MI.getOperand(0).setReg(TmpReg); 2651 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2652 2653 // Use a v_mov_b32 here to make the exec dependency explicit. 
2654 buildVCopy(B, DstReg, TmpReg); 2655 } 2656 2657 // Re-insert the constant offset add inside the waterfall loop. 2658 if (ShouldMoveIndexIntoLoop) 2659 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2660 2661 return; 2662 } 2663 2664 assert(DstTy.getSizeInBits() == 64); 2665 2666 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32); 2667 2668 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2669 auto One = B.buildConstant(S32, 1); 2670 2671 MachineBasicBlock::iterator MII = MI.getIterator(); 2672 2673 // Split the vector index into 32-bit pieces. Prepare to move all of the 2674 // new instructions into a waterfall loop if necessary. 2675 // 2676 // Don't put the bitcast or constant in the loop. 2677 MachineInstrSpan Span(MII, &B.getMBB()); 2678 2679 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2680 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2681 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2682 2683 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2684 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2685 2686 MRI.setRegBank(DstReg, *DstBank); 2687 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2688 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2689 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2690 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2691 2692 SmallSet<Register, 4> OpsToWaterfall; 2693 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2694 MI.eraseFromParent(); 2695 return; 2696 } 2697 2698 // Remove the original instruction to avoid potentially confusing the 2699 // waterfall loop logic. 
2700 B.setInstr(*Span.begin()); 2701 MI.eraseFromParent(); 2702 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2703 OpsToWaterfall, MRI); 2704 2705 if (NeedCopyToVGPR) { 2706 MachineBasicBlock *LoopBB = Extract1->getParent(); 2707 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2708 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2709 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2710 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2711 2712 Extract0->getOperand(0).setReg(TmpReg0); 2713 Extract1->getOperand(0).setReg(TmpReg1); 2714 2715 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2716 2717 buildVCopy(B, DstRegs[0], TmpReg0); 2718 buildVCopy(B, DstRegs[1], TmpReg1); 2719 } 2720 2721 if (ShouldMoveIndexIntoLoop) 2722 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2723 2724 return; 2725 } 2726 case AMDGPU::G_INSERT_VECTOR_ELT: { 2727 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2728 2729 Register DstReg = MI.getOperand(0).getReg(); 2730 LLT VecTy = MRI.getType(DstReg); 2731 2732 assert(OpdMapper.getVRegs(0).empty()); 2733 assert(OpdMapper.getVRegs(3).empty()); 2734 2735 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2736 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2737 2738 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) 2739 return; 2740 2741 const RegisterBank *IdxBank = 2742 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2743 2744 Register SrcReg = MI.getOperand(1).getReg(); 2745 Register InsReg = MI.getOperand(2).getReg(); 2746 LLT InsTy = MRI.getType(InsReg); 2747 (void)InsTy; 2748 2749 Register BaseIdxReg; 2750 unsigned ConstOffset; 2751 std::tie(BaseIdxReg, ConstOffset) = 2752 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2753 2754 // See if the index is an add of a constant which will be foldable by moving 2755 // the base register of the index later if this is going to be executed in a 2756 // waterfall loop. 
This is essentially to reassociate the add of a constant 2757 // with the readfirstlane. 2758 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2759 ConstOffset > 0 && 2760 ConstOffset < VecTy.getNumElements(); 2761 2762 // Move the base register. We'll re-insert the add later. 2763 if (ShouldMoveIndexIntoLoop) 2764 MI.getOperand(3).setReg(BaseIdxReg); 2765 2766 2767 if (InsRegs.empty()) { 2768 executeInWaterfallLoop(MI, MRI, { 3 }); 2769 2770 // Re-insert the constant offset add inside the waterfall loop. 2771 if (ShouldMoveIndexIntoLoop) { 2772 MachineIRBuilder B(MI); 2773 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2774 } 2775 2776 return; 2777 } 2778 2779 2780 assert(InsTy.getSizeInBits() == 64); 2781 2782 const LLT S32 = LLT::scalar(32); 2783 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); 2784 2785 MachineIRBuilder B(MI); 2786 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2787 auto One = B.buildConstant(S32, 1); 2788 2789 // Split the vector index into 32-bit pieces. Prepare to move all of the 2790 // new instructions into a waterfall loop if necessary. 2791 // 2792 // Don't put the bitcast or constant in the loop. 2793 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2794 2795 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
2796 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2797 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2798 2799 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); 2800 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); 2801 2802 const RegisterBank *DstBank = 2803 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2804 const RegisterBank *SrcBank = 2805 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2806 const RegisterBank *InsSrcBank = 2807 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2808 2809 MRI.setRegBank(InsReg, *InsSrcBank); 2810 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2811 MRI.setRegBank(InsLo.getReg(0), *DstBank); 2812 MRI.setRegBank(InsHi.getReg(0), *DstBank); 2813 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2814 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2815 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2816 2817 2818 SmallSet<Register, 4> OpsToWaterfall; 2819 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { 2820 B.setInsertPt(B.getMBB(), MI); 2821 B.buildBitcast(DstReg, InsHi); 2822 MI.eraseFromParent(); 2823 return; 2824 } 2825 2826 B.setInstr(*Span.begin()); 2827 MI.eraseFromParent(); 2828 2829 // Figure out the point after the waterfall loop before mangling the control 2830 // flow. 2831 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2832 OpsToWaterfall, MRI); 2833 2834 // The insertion point is now right after the original instruction. 2835 // 2836 // Keep the bitcast to the original vector type out of the loop. Doing this 2837 // saved an extra phi we don't need inside the loop. 2838 B.buildBitcast(DstReg, InsHi); 2839 2840 // Re-insert the constant offset add inside the waterfall loop. 
2841 if (ShouldMoveIndexIntoLoop) 2842 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2843 2844 return; 2845 } 2846 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 2847 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 2848 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 2849 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 2850 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 2851 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 2852 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 2853 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 2854 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 2855 case AMDGPU::G_AMDGPU_BUFFER_STORE: 2856 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 2857 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 2858 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 2859 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 2860 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 2861 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 2862 applyDefaultMapping(OpdMapper); 2863 executeInWaterfallLoop(MI, MRI, {1, 4}); 2864 return; 2865 } 2866 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 2867 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 2868 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 2869 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 2870 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 2871 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 2872 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 2873 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 2874 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 2875 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 2876 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 2877 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 2878 applyDefaultMapping(OpdMapper); 2879 executeInWaterfallLoop(MI, MRI, {2, 5}); 2880 return; 2881 } 2882 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 2883 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 2884 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 2885 applyDefaultMapping(OpdMapper); 2886 executeInWaterfallLoop(MI, MRI, {2, 5}); 2887 return; 2888 } 2889 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 2890 applyDefaultMapping(OpdMapper); 2891 
executeInWaterfallLoop(MI, MRI, {3, 6}); 2892 return; 2893 } 2894 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 2895 applyMappingSBufferLoad(OpdMapper); 2896 return; 2897 } 2898 case AMDGPU::G_INTRINSIC: { 2899 switch (MI.getIntrinsicID()) { 2900 case Intrinsic::amdgcn_readlane: { 2901 substituteSimpleCopyRegs(OpdMapper, 2); 2902 2903 assert(OpdMapper.getVRegs(0).empty()); 2904 assert(OpdMapper.getVRegs(3).empty()); 2905 2906 // Make sure the index is an SGPR. It doesn't make sense to run this in a 2907 // waterfall loop, so assume it's a uniform value. 2908 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2909 return; 2910 } 2911 case Intrinsic::amdgcn_writelane: { 2912 assert(OpdMapper.getVRegs(0).empty()); 2913 assert(OpdMapper.getVRegs(2).empty()); 2914 assert(OpdMapper.getVRegs(3).empty()); 2915 2916 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 2917 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 2918 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2919 return; 2920 } 2921 case Intrinsic::amdgcn_interp_p1: 2922 case Intrinsic::amdgcn_interp_p2: 2923 case Intrinsic::amdgcn_interp_mov: 2924 case Intrinsic::amdgcn_interp_p1_f16: 2925 case Intrinsic::amdgcn_interp_p2_f16: { 2926 applyDefaultMapping(OpdMapper); 2927 2928 // Readlane for m0 value, which is always the last operand. 2929 // FIXME: Should this be a waterfall loop instead? 2930 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index 2931 return; 2932 } 2933 case Intrinsic::amdgcn_permlane16: 2934 case Intrinsic::amdgcn_permlanex16: { 2935 // Doing a waterfall loop over these wouldn't make any sense. 
2936 substituteSimpleCopyRegs(OpdMapper, 2); 2937 substituteSimpleCopyRegs(OpdMapper, 3); 2938 constrainOpWithReadfirstlane(MI, MRI, 4); 2939 constrainOpWithReadfirstlane(MI, MRI, 5); 2940 return; 2941 } 2942 case Intrinsic::amdgcn_sbfe: 2943 applyMappingBFE(OpdMapper, true); 2944 return; 2945 case Intrinsic::amdgcn_ubfe: 2946 applyMappingBFE(OpdMapper, false); 2947 return; 2948 case Intrinsic::amdgcn_ballot: 2949 // Use default handling and insert copy to vcc source. 2950 break; 2951 } 2952 break; 2953 } 2954 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 2955 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 2956 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 2957 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 2958 const AMDGPU::RsrcIntrinsic *RSrcIntrin 2959 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID()); 2960 assert(RSrcIntrin && RSrcIntrin->IsImage); 2961 // Non-images can have complications from operands that allow both SGPR 2962 // and VGPR. For now it's too complicated to figure out the final opcode 2963 // to derive the register bank from the MCInstrDesc. 2964 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); 2965 return; 2966 } 2967 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { 2968 unsigned N = MI.getNumExplicitOperands() - 2; 2969 applyDefaultMapping(OpdMapper); 2970 executeInWaterfallLoop(MI, MRI, { N }); 2971 return; 2972 } 2973 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 2974 auto IntrID = MI.getIntrinsicID(); 2975 switch (IntrID) { 2976 case Intrinsic::amdgcn_ds_ordered_add: 2977 case Intrinsic::amdgcn_ds_ordered_swap: { 2978 // This is only allowed to execute with 1 lane, so readfirstlane is safe. 
2979 assert(OpdMapper.getVRegs(0).empty()); 2980 substituteSimpleCopyRegs(OpdMapper, 3); 2981 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 2982 return; 2983 } 2984 case Intrinsic::amdgcn_ds_gws_init: 2985 case Intrinsic::amdgcn_ds_gws_barrier: 2986 case Intrinsic::amdgcn_ds_gws_sema_br: { 2987 // Only the first lane is executes, so readfirstlane is safe. 2988 substituteSimpleCopyRegs(OpdMapper, 1); 2989 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 2990 return; 2991 } 2992 case Intrinsic::amdgcn_ds_gws_sema_v: 2993 case Intrinsic::amdgcn_ds_gws_sema_p: 2994 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 2995 // Only the first lane is executes, so readfirstlane is safe. 2996 constrainOpWithReadfirstlane(MI, MRI, 1); // M0 2997 return; 2998 } 2999 case Intrinsic::amdgcn_ds_append: 3000 case Intrinsic::amdgcn_ds_consume: { 3001 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 3002 return; 3003 } 3004 case Intrinsic::amdgcn_s_sendmsg: 3005 case Intrinsic::amdgcn_s_sendmsghalt: { 3006 // FIXME: Should this use a waterfall loop? 3007 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 3008 return; 3009 } 3010 case Intrinsic::amdgcn_s_setreg: { 3011 constrainOpWithReadfirstlane(MI, MRI, 2); 3012 return; 3013 } 3014 default: { 3015 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = 3016 AMDGPU::lookupRsrcIntrinsic(IntrID)) { 3017 // Non-images can have complications from operands that allow both SGPR 3018 // and VGPR. For now it's too complicated to figure out the final opcode 3019 // to derive the register bank from the MCInstrDesc. 3020 if (RSrcIntrin->IsImage) { 3021 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); 3022 return; 3023 } 3024 } 3025 3026 break; 3027 } 3028 } 3029 break; 3030 } 3031 case AMDGPU::G_SI_CALL: { 3032 // Use a set to avoid extra readfirstlanes in the case where multiple 3033 // operands are the same register. 
3034 SmallSet<Register, 4> SGPROperandRegs; 3035 3036 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1})) 3037 break; 3038 3039 // Move all copies to physical SGPRs that are used by the call instruction 3040 // into the loop block. Start searching for these copies until the 3041 // ADJCALLSTACKUP. 3042 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP; 3043 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN; 3044 3045 // Move all non-copies before the copies, so that a complete range can be 3046 // moved into the waterfall loop. 3047 SmallVector<MachineInstr *, 4> NonCopyInstrs; 3048 // Count of NonCopyInstrs found until the current LastCopy. 3049 unsigned NonCopyInstrsLen = 0; 3050 MachineBasicBlock::iterator Start(&MI); 3051 MachineBasicBlock::iterator LastCopy = Start; 3052 MachineBasicBlock *MBB = MI.getParent(); 3053 const SIMachineFunctionInfo *Info = 3054 MBB->getParent()->getInfo<SIMachineFunctionInfo>(); 3055 while (Start->getOpcode() != FrameSetupOpcode) { 3056 --Start; 3057 bool IsCopy = false; 3058 if (Start->getOpcode() == AMDGPU::COPY) { 3059 auto &Dst = Start->getOperand(0); 3060 if (Dst.isReg()) { 3061 Register Reg = Dst.getReg(); 3062 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) { 3063 IsCopy = true; 3064 } else { 3065 // Also move the copy from the scratch rsrc descriptor into the loop 3066 // to allow it to be optimized away. 
3067 auto &Src = Start->getOperand(1); 3068 if (Src.isReg()) { 3069 Reg = Src.getReg(); 3070 IsCopy = Info->getScratchRSrcReg() == Reg; 3071 } 3072 } 3073 } 3074 } 3075 3076 if (IsCopy) { 3077 LastCopy = Start; 3078 NonCopyInstrsLen = NonCopyInstrs.size(); 3079 } else { 3080 NonCopyInstrs.push_back(&*Start); 3081 } 3082 } 3083 NonCopyInstrs.resize(NonCopyInstrsLen); 3084 3085 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3086 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3087 } 3088 Start = LastCopy; 3089 3090 // Do the same for copies after the loop 3091 NonCopyInstrs.clear(); 3092 NonCopyInstrsLen = 0; 3093 MachineBasicBlock::iterator End(&MI); 3094 LastCopy = End; 3095 while (End->getOpcode() != FrameDestroyOpcode) { 3096 ++End; 3097 bool IsCopy = false; 3098 if (End->getOpcode() == AMDGPU::COPY) { 3099 auto &Src = End->getOperand(1); 3100 if (Src.isReg()) { 3101 Register Reg = Src.getReg(); 3102 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); 3103 } 3104 } 3105 3106 if (IsCopy) { 3107 LastCopy = End; 3108 NonCopyInstrsLen = NonCopyInstrs.size(); 3109 } else { 3110 NonCopyInstrs.push_back(&*End); 3111 } 3112 } 3113 NonCopyInstrs.resize(NonCopyInstrsLen); 3114 3115 End = LastCopy; 3116 ++LastCopy; 3117 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3118 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3119 } 3120 3121 ++End; 3122 MachineIRBuilder B(*Start); 3123 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI); 3124 break; 3125 } 3126 case AMDGPU::G_LOAD: 3127 case AMDGPU::G_ZEXTLOAD: 3128 case AMDGPU::G_SEXTLOAD: { 3129 if (applyMappingLoad(MI, OpdMapper, MRI)) 3130 return; 3131 break; 3132 } 3133 case AMDGPU::G_DYN_STACKALLOC: 3134 applyMappingDynStackAlloc(MI, OpdMapper, MRI); 3135 return; 3136 case AMDGPU::G_SBFX: 3137 applyMappingBFE(OpdMapper, /*Signed*/ true); 3138 return; 3139 case AMDGPU::G_UBFX: 3140 applyMappingBFE(OpdMapper, /*Signed*/ false); 3141 return; 3142 default: 3143 break; 3144 } 3145 3146 return 
applyDefaultMapping(OpdMapper); 3147 } 3148 3149 // vgpr, sgpr -> vgpr 3150 // vgpr, agpr -> vgpr 3151 // agpr, agpr -> agpr 3152 // agpr, sgpr -> vgpr 3153 static unsigned regBankUnion(unsigned RB0, unsigned RB1) { 3154 if (RB0 == AMDGPU::InvalidRegBankID) 3155 return RB1; 3156 if (RB1 == AMDGPU::InvalidRegBankID) 3157 return RB0; 3158 3159 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) 3160 return AMDGPU::SGPRRegBankID; 3161 3162 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) 3163 return AMDGPU::AGPRRegBankID; 3164 3165 return AMDGPU::VGPRRegBankID; 3166 } 3167 3168 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { 3169 if (RB0 == AMDGPU::InvalidRegBankID) 3170 return RB1; 3171 if (RB1 == AMDGPU::InvalidRegBankID) 3172 return RB0; 3173 3174 // vcc, vcc -> vcc 3175 // vcc, sgpr -> vcc 3176 // vcc, vgpr -> vcc 3177 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) 3178 return AMDGPU::VCCRegBankID; 3179 3180 // vcc, vgpr -> vgpr 3181 return regBankUnion(RB0, RB1); 3182 } 3183 3184 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, 3185 const MachineInstr &MI) const { 3186 unsigned RegBank = AMDGPU::InvalidRegBankID; 3187 3188 for (const MachineOperand &MO : MI.operands()) { 3189 if (!MO.isReg()) 3190 continue; 3191 Register Reg = MO.getReg(); 3192 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3193 RegBank = regBankUnion(RegBank, Bank->getID()); 3194 if (RegBank == AMDGPU::VGPRRegBankID) 3195 break; 3196 } 3197 } 3198 3199 return RegBank; 3200 } 3201 3202 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 3203 const MachineFunction &MF = *MI.getParent()->getParent(); 3204 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3205 for (const MachineOperand &MO : MI.operands()) { 3206 if (!MO.isReg()) 3207 continue; 3208 Register Reg = MO.getReg(); 3209 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3210 if (Bank->getID() != 
AMDGPU::SGPRRegBankID) 3211 return false; 3212 } 3213 } 3214 return true; 3215 } 3216 3217 const RegisterBankInfo::InstructionMapping & 3218 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 3219 const MachineFunction &MF = *MI.getParent()->getParent(); 3220 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3221 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3222 3223 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3224 const MachineOperand &SrcOp = MI.getOperand(i); 3225 if (!SrcOp.isReg()) 3226 continue; 3227 3228 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); 3229 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3230 } 3231 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3232 MI.getNumOperands()); 3233 } 3234 3235 const RegisterBankInfo::InstructionMapping & 3236 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { 3237 const MachineFunction &MF = *MI.getParent()->getParent(); 3238 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3239 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3240 3241 // Even though we technically could use SGPRs, this would require knowledge of 3242 // the constant bus restriction. Force all sources to VGPR (except for VCC). 3243 // 3244 // TODO: Unary ops are trivially OK, so accept SGPRs? 3245 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3246 const MachineOperand &Src = MI.getOperand(i); 3247 if (!Src.isReg()) 3248 continue; 3249 3250 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); 3251 unsigned BankID = Size == 1 ? 
AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 3252 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); 3253 } 3254 3255 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3256 MI.getNumOperands()); 3257 } 3258 3259 const RegisterBankInfo::InstructionMapping & 3260 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { 3261 const MachineFunction &MF = *MI.getParent()->getParent(); 3262 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3263 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3264 3265 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 3266 const MachineOperand &Op = MI.getOperand(I); 3267 if (!Op.isReg()) 3268 continue; 3269 3270 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); 3271 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3272 } 3273 3274 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3275 MI.getNumOperands()); 3276 } 3277 3278 const RegisterBankInfo::InstructionMapping & 3279 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, 3280 const MachineInstr &MI, 3281 int RsrcIdx) const { 3282 // The reported argument index is relative to the IR intrinsic call arguments, 3283 // so we need to shift by the number of defs and the intrinsic ID. 3284 RsrcIdx += MI.getNumExplicitDefs() + 1; 3285 3286 const int NumOps = MI.getNumOperands(); 3287 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); 3288 3289 // TODO: Should packed/unpacked D16 difference be reported here as part of 3290 // the value mapping? 
3291 for (int I = 0; I != NumOps; ++I) { 3292 if (!MI.getOperand(I).isReg()) 3293 continue; 3294 3295 Register OpReg = MI.getOperand(I).getReg(); 3296 // We replace some dead address operands with $noreg 3297 if (!OpReg) 3298 continue; 3299 3300 unsigned Size = getSizeInBits(OpReg, MRI, *TRI); 3301 3302 // FIXME: Probably need a new intrinsic register bank searchable table to 3303 // handle arbitrary intrinsics easily. 3304 // 3305 // If this has a sampler, it immediately follows rsrc. 3306 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; 3307 3308 if (MustBeSGPR) { 3309 // If this must be an SGPR, so we must report whatever it is as legal. 3310 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID); 3311 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); 3312 } else { 3313 // Some operands must be VGPR, and these are easy to copy to. 3314 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3315 } 3316 } 3317 3318 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); 3319 } 3320 3321 /// Return the mapping for a pointer argument. 3322 const RegisterBankInfo::ValueMapping * 3323 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, 3324 Register PtrReg) const { 3325 LLT PtrTy = MRI.getType(PtrReg); 3326 unsigned Size = PtrTy.getSizeInBits(); 3327 if (Subtarget.useFlatForGlobal() || 3328 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) 3329 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3330 3331 // If we're using MUBUF instructions for global memory, an SGPR base register 3332 // is possible. Otherwise this needs to be a VGPR. 
3333 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3334 return AMDGPU::getValueMapping(PtrBank->getID(), Size); 3335 } 3336 3337 const RegisterBankInfo::InstructionMapping & 3338 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { 3339 3340 const MachineFunction &MF = *MI.getParent()->getParent(); 3341 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3342 SmallVector<const ValueMapping*, 2> OpdsMapping(2); 3343 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3344 Register PtrReg = MI.getOperand(1).getReg(); 3345 LLT PtrTy = MRI.getType(PtrReg); 3346 unsigned AS = PtrTy.getAddressSpace(); 3347 unsigned PtrSize = PtrTy.getSizeInBits(); 3348 3349 const ValueMapping *ValMapping; 3350 const ValueMapping *PtrMapping; 3351 3352 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3353 3354 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { 3355 if (isScalarLoadLegal(MI)) { 3356 // We have a uniform instruction so we want to use an SMRD load 3357 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3358 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); 3359 } else { 3360 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3361 3362 // If we're using MUBUF instructions for global memory, an SGPR base 3363 // register is possible. Otherwise this needs to be a VGPR. 3364 unsigned PtrBankID = Subtarget.useFlatForGlobal() ? 
3365 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; 3366 3367 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize); 3368 } 3369 } else { 3370 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3371 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); 3372 } 3373 3374 OpdsMapping[0] = ValMapping; 3375 OpdsMapping[1] = PtrMapping; 3376 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( 3377 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); 3378 return Mapping; 3379 3380 // FIXME: Do we want to add a mapping for FLAT load, or should we just 3381 // handle that during instruction selection? 3382 } 3383 3384 unsigned 3385 AMDGPURegisterBankInfo::getRegBankID(Register Reg, 3386 const MachineRegisterInfo &MRI, 3387 unsigned Default) const { 3388 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3389 return Bank ? Bank->getID() : Default; 3390 } 3391 3392 const RegisterBankInfo::ValueMapping * 3393 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, 3394 const MachineRegisterInfo &MRI, 3395 const TargetRegisterInfo &TRI) const { 3396 // Lie and claim anything is legal, even though this needs to be an SGPR 3397 // applyMapping will have to deal with it as a waterfall loop. 
3398 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID); 3399 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3400 return AMDGPU::getValueMapping(Bank, Size); 3401 } 3402 3403 const RegisterBankInfo::ValueMapping * 3404 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, 3405 const MachineRegisterInfo &MRI, 3406 const TargetRegisterInfo &TRI) const { 3407 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3408 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3409 } 3410 3411 const RegisterBankInfo::ValueMapping * 3412 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, 3413 const MachineRegisterInfo &MRI, 3414 const TargetRegisterInfo &TRI) const { 3415 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3416 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size); 3417 } 3418 3419 /// 3420 /// This function must return a legal mapping, because 3421 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called 3422 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a 3423 /// VGPR to SGPR generated is illegal. 3424 /// 3425 // Operands that must be SGPRs must accept potentially divergent VGPRs as 3426 // legal. These will be dealt with in applyMappingImpl. 3427 // 3428 const RegisterBankInfo::InstructionMapping & 3429 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { 3430 const MachineFunction &MF = *MI.getParent()->getParent(); 3431 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3432 3433 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { 3434 // The default logic bothers to analyze impossible alternative mappings. We 3435 // want the most straightforward mapping, so just directly handle this. 
3436 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI, 3437 *TRI); 3438 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI, 3439 *TRI); 3440 assert(SrcBank && "src bank should have been assigned already"); 3441 if (!DstBank) 3442 DstBank = SrcBank; 3443 3444 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3445 if (cannotCopy(*DstBank, *SrcBank, Size)) 3446 return getInvalidInstructionMapping(); 3447 3448 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank); 3449 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2; 3450 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); 3451 OpdsMapping[0] = &ValMap; 3452 if (MI.getOpcode() == AMDGPU::G_FREEZE) 3453 OpdsMapping[1] = &ValMap; 3454 3455 return getInstructionMapping( 3456 1, /*Cost*/ 1, 3457 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize); 3458 } 3459 3460 if (MI.isRegSequence()) { 3461 // If any input is a VGPR, the result must be a VGPR. The default handling 3462 // assumes any copy between banks is legal. 3463 unsigned BankID = AMDGPU::SGPRRegBankID; 3464 3465 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3466 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI); 3467 // It doesn't make sense to use vcc or scc banks here, so just ignore 3468 // them. 3469 if (OpBank != AMDGPU::SGPRRegBankID) { 3470 BankID = AMDGPU::VGPRRegBankID; 3471 break; 3472 } 3473 } 3474 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3475 3476 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); 3477 return getInstructionMapping( 3478 1, /*Cost*/ 1, 3479 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3480 } 3481 3482 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies 3483 // properly. 3484 // 3485 // TODO: There are additional exec masking dependencies to analyze. 
3486 if (MI.getOpcode() == TargetOpcode::G_PHI) { 3487 unsigned ResultBank = AMDGPU::InvalidRegBankID; 3488 Register DstReg = MI.getOperand(0).getReg(); 3489 3490 // Sometimes the result may have already been assigned a bank. 3491 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) 3492 ResultBank = DstBank->getID(); 3493 3494 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3495 Register Reg = MI.getOperand(I).getReg(); 3496 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3497 3498 // FIXME: Assuming VGPR for any undetermined inputs. 3499 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 3500 ResultBank = AMDGPU::VGPRRegBankID; 3501 break; 3502 } 3503 3504 // FIXME: Need to promote SGPR case to s32 3505 unsigned OpBank = Bank->getID(); 3506 ResultBank = regBankBoolUnion(ResultBank, OpBank); 3507 } 3508 3509 assert(ResultBank != AMDGPU::InvalidRegBankID); 3510 3511 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 3512 3513 const ValueMapping &ValMap = 3514 getValueMapping(0, Size, getRegBank(ResultBank)); 3515 return getInstructionMapping( 3516 1, /*Cost*/ 1, 3517 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3518 } 3519 3520 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 3521 if (Mapping.isValid()) 3522 return Mapping; 3523 3524 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3525 3526 switch (MI.getOpcode()) { 3527 default: 3528 return getInvalidInstructionMapping(); 3529 3530 case AMDGPU::G_AND: 3531 case AMDGPU::G_OR: 3532 case AMDGPU::G_XOR: { 3533 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3534 if (Size == 1) { 3535 const RegisterBank *DstBank 3536 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 3537 3538 unsigned TargetBankID = AMDGPU::InvalidRegBankID; 3539 unsigned BankLHS = AMDGPU::InvalidRegBankID; 3540 unsigned BankRHS = AMDGPU::InvalidRegBankID; 3541 if (DstBank) { 3542 TargetBankID = DstBank->getID(); 3543 if 
(DstBank == &AMDGPU::VCCRegBank) { 3544 TargetBankID = AMDGPU::VCCRegBankID; 3545 BankLHS = AMDGPU::VCCRegBankID; 3546 BankRHS = AMDGPU::VCCRegBankID; 3547 } else { 3548 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3549 AMDGPU::SGPRRegBankID); 3550 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3551 AMDGPU::SGPRRegBankID); 3552 } 3553 } else { 3554 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3555 AMDGPU::VCCRegBankID); 3556 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3557 AMDGPU::VCCRegBankID); 3558 3559 // Both inputs should be true booleans to produce a boolean result. 3560 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 3561 TargetBankID = AMDGPU::VGPRRegBankID; 3562 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 3563 TargetBankID = AMDGPU::VCCRegBankID; 3564 BankLHS = AMDGPU::VCCRegBankID; 3565 BankRHS = AMDGPU::VCCRegBankID; 3566 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 3567 TargetBankID = AMDGPU::SGPRRegBankID; 3568 } 3569 } 3570 3571 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 3572 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 3573 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 3574 break; 3575 } 3576 3577 if (Size == 64) { 3578 3579 if (isSALUMapping(MI)) { 3580 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 3581 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 3582 } else { 3583 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 3584 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); 3585 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 3586 3587 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/); 3588 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 3589 } 3590 3591 break; 3592 } 3593 3594 LLVM_FALLTHROUGH; 3595 } 3596 case AMDGPU::G_PTR_ADD: 3597 case 
AMDGPU::G_PTRMASK: 3598 case AMDGPU::G_ADD: 3599 case AMDGPU::G_SUB: 3600 case AMDGPU::G_MUL: 3601 case AMDGPU::G_SHL: 3602 case AMDGPU::G_LSHR: 3603 case AMDGPU::G_ASHR: 3604 case AMDGPU::G_UADDO: 3605 case AMDGPU::G_USUBO: 3606 case AMDGPU::G_UADDE: 3607 case AMDGPU::G_SADDE: 3608 case AMDGPU::G_USUBE: 3609 case AMDGPU::G_SSUBE: 3610 case AMDGPU::G_SMIN: 3611 case AMDGPU::G_SMAX: 3612 case AMDGPU::G_UMIN: 3613 case AMDGPU::G_UMAX: 3614 case AMDGPU::G_ABS: 3615 case AMDGPU::G_SHUFFLE_VECTOR: 3616 case AMDGPU::G_SBFX: 3617 case AMDGPU::G_UBFX: 3618 if (isSALUMapping(MI)) 3619 return getDefaultMappingSOP(MI); 3620 LLVM_FALLTHROUGH; 3621 3622 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU 3623 case AMDGPU::G_SSUBSAT: 3624 case AMDGPU::G_UADDSAT: 3625 case AMDGPU::G_USUBSAT: 3626 case AMDGPU::G_FADD: 3627 case AMDGPU::G_FSUB: 3628 case AMDGPU::G_FPTOSI: 3629 case AMDGPU::G_FPTOUI: 3630 case AMDGPU::G_FMUL: 3631 case AMDGPU::G_FMA: 3632 case AMDGPU::G_FMAD: 3633 case AMDGPU::G_FSQRT: 3634 case AMDGPU::G_FFLOOR: 3635 case AMDGPU::G_FCEIL: 3636 case AMDGPU::G_FRINT: 3637 case AMDGPU::G_SITOFP: 3638 case AMDGPU::G_UITOFP: 3639 case AMDGPU::G_FPTRUNC: 3640 case AMDGPU::G_FPEXT: 3641 case AMDGPU::G_FEXP2: 3642 case AMDGPU::G_FLOG2: 3643 case AMDGPU::G_FMINNUM: 3644 case AMDGPU::G_FMAXNUM: 3645 case AMDGPU::G_FMINNUM_IEEE: 3646 case AMDGPU::G_FMAXNUM_IEEE: 3647 case AMDGPU::G_FCANONICALIZE: 3648 case AMDGPU::G_INTRINSIC_TRUNC: 3649 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
3650 case AMDGPU::G_FSHR: // TODO: Expand for scalar 3651 case AMDGPU::G_AMDGPU_FMIN_LEGACY: 3652 case AMDGPU::G_AMDGPU_FMAX_LEGACY: 3653 case AMDGPU::G_AMDGPU_RCP_IFLAG: 3654 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 3655 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 3656 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 3657 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 3658 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: 3659 case AMDGPU::G_AMDGPU_SMED3: 3660 return getDefaultMappingVOP(MI); 3661 case AMDGPU::G_UMULH: 3662 case AMDGPU::G_SMULH: { 3663 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 3664 return getDefaultMappingSOP(MI); 3665 return getDefaultMappingVOP(MI); 3666 } 3667 case AMDGPU::G_IMPLICIT_DEF: { 3668 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3669 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3670 break; 3671 } 3672 case AMDGPU::G_FCONSTANT: 3673 case AMDGPU::G_CONSTANT: 3674 case AMDGPU::G_GLOBAL_VALUE: 3675 case AMDGPU::G_BLOCK_ADDR: 3676 case AMDGPU::G_READCYCLECOUNTER: { 3677 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3678 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3679 break; 3680 } 3681 case AMDGPU::G_FRAME_INDEX: { 3682 // TODO: This should be the same as other constants, but eliminateFrameIndex 3683 // currently assumes VALU uses. 3684 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3685 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3686 break; 3687 } 3688 case AMDGPU::G_DYN_STACKALLOC: { 3689 // Result is always uniform, and a wave reduction is needed for the source. 
3690 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3691 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3692 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); 3693 break; 3694 } 3695 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { 3696 // This case is weird because we expect a physical register in the source, 3697 // but need to set a bank anyway. 3698 // 3699 // We could select the result to SGPR or VGPR, but for the one current use 3700 // it's more practical to always use VGPR. 3701 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3702 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3703 break; 3704 } 3705 case AMDGPU::G_INSERT: { 3706 unsigned BankID = getMappingType(MRI, MI); 3707 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3708 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3709 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 3710 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3711 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3712 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 3713 OpdsMapping[3] = nullptr; 3714 break; 3715 } 3716 case AMDGPU::G_EXTRACT: { 3717 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3718 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3719 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3720 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3721 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3722 OpdsMapping[2] = nullptr; 3723 break; 3724 } 3725 case AMDGPU::G_BUILD_VECTOR: 3726 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 3727 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 3728 if (DstTy == LLT::fixed_vector(2, 16)) { 3729 unsigned DstSize = DstTy.getSizeInBits(); 3730 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3731 unsigned 
Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3732 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 3733 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 3734 3735 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); 3736 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 3737 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 3738 break; 3739 } 3740 3741 LLVM_FALLTHROUGH; 3742 } 3743 case AMDGPU::G_MERGE_VALUES: 3744 case AMDGPU::G_CONCAT_VECTORS: { 3745 unsigned Bank = getMappingType(MRI, MI); 3746 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3747 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3748 3749 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3750 // Op1 and Dst should use the same register bank. 3751 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 3752 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 3753 break; 3754 } 3755 case AMDGPU::G_BITREVERSE: 3756 case AMDGPU::G_BITCAST: 3757 case AMDGPU::G_INTTOPTR: 3758 case AMDGPU::G_PTRTOINT: 3759 case AMDGPU::G_FABS: 3760 case AMDGPU::G_FNEG: { 3761 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3762 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3763 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3764 break; 3765 } 3766 case AMDGPU::G_AMDGPU_FFBH_U32: 3767 case AMDGPU::G_AMDGPU_FFBL_B32: 3768 case AMDGPU::G_CTLZ_ZERO_UNDEF: 3769 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 3770 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3771 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3772 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3773 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); 3774 break; 3775 } 3776 case AMDGPU::G_CTPOP: { 3777 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3778 unsigned BankID = 
getRegBankID(MI.getOperand(1).getReg(), MRI); 3779 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3780 3781 // This should really be getValueMappingSGPR64Only, but allowing the generic 3782 // code to handle the register split just makes using LegalizerHelper more 3783 // difficult. 3784 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3785 break; 3786 } 3787 case AMDGPU::G_TRUNC: { 3788 Register Dst = MI.getOperand(0).getReg(); 3789 Register Src = MI.getOperand(1).getReg(); 3790 unsigned Bank = getRegBankID(Src, MRI); 3791 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3792 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3793 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3794 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 3795 break; 3796 } 3797 case AMDGPU::G_ZEXT: 3798 case AMDGPU::G_SEXT: 3799 case AMDGPU::G_ANYEXT: 3800 case AMDGPU::G_SEXT_INREG: { 3801 Register Dst = MI.getOperand(0).getReg(); 3802 Register Src = MI.getOperand(1).getReg(); 3803 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3804 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3805 3806 unsigned DstBank; 3807 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 3808 assert(SrcBank); 3809 switch (SrcBank->getID()) { 3810 case AMDGPU::SGPRRegBankID: 3811 DstBank = AMDGPU::SGPRRegBankID; 3812 break; 3813 default: 3814 DstBank = AMDGPU::VGPRRegBankID; 3815 break; 3816 } 3817 3818 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 3819 // 32-bits, and then to 64. 
3820 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 3821 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 3822 SrcSize); 3823 break; 3824 } 3825 case AMDGPU::G_FCMP: { 3826 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3827 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 3828 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3829 OpdsMapping[1] = nullptr; // Predicate Operand. 3830 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); 3831 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3832 break; 3833 } 3834 case AMDGPU::G_STORE: { 3835 assert(MI.getOperand(0).isReg()); 3836 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3837 3838 // FIXME: We need to specify a different reg bank once scalar stores are 3839 // supported. 3840 const ValueMapping *ValMapping = 3841 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3842 OpdsMapping[0] = ValMapping; 3843 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 3844 break; 3845 } 3846 case AMDGPU::G_ICMP: { 3847 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 3848 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3849 3850 // See if the result register has already been constrained to vcc, which may 3851 // happen due to control flow intrinsic lowering. 3852 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, 3853 AMDGPU::SGPRRegBankID); 3854 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 3855 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI); 3856 3857 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && 3858 Op2Bank == AMDGPU::SGPRRegBankID && 3859 Op3Bank == AMDGPU::SGPRRegBankID && 3860 (Size == 32 || (Size == 64 && 3861 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 3862 Subtarget.hasScalarCompareEq64())); 3863 3864 DstBank = CanUseSCC ? 
AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 3865 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 3866 3867 // TODO: Use 32-bit for scalar output size. 3868 // SCC results will need to be copied to a 32-bit SGPR virtual register. 3869 const unsigned ResultSize = 1; 3870 3871 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize); 3872 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size); 3873 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size); 3874 break; 3875 } 3876 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 3877 // VGPR index can be used for waterfall when indexing a SGPR vector. 3878 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3879 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3880 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3881 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3882 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI); 3883 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); 3884 3885 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); 3886 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); 3887 3888 // The index can be either if the source vector is VGPR. 3889 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); 3890 break; 3891 } 3892 case AMDGPU::G_INSERT_VECTOR_ELT: { 3893 unsigned OutputBankID = isSALUMapping(MI) ? 
3894 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 3895 3896 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3897 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3898 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 3899 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 3900 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI); 3901 3902 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); 3903 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); 3904 3905 // This is a weird case, because we need to break down the mapping based on 3906 // the register bank of a different operand. 3907 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { 3908 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID, 3909 InsertSize); 3910 } else { 3911 assert(InsertSize == 32 || InsertSize == 64); 3912 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize); 3913 } 3914 3915 // The index can be either if the source vector is VGPR. 3916 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); 3917 break; 3918 } 3919 case AMDGPU::G_UNMERGE_VALUES: { 3920 unsigned Bank = getMappingType(MRI, MI); 3921 3922 // Op1 and Dst should use the same register bank. 3923 // FIXME: Shouldn't this be the default? Why do we need to handle this? 
3924 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3925 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 3926 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 3927 } 3928 break; 3929 } 3930 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 3931 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 3932 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 3933 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 3934 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 3935 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 3936 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 3937 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 3938 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 3939 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 3940 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: 3941 case AMDGPU::G_AMDGPU_BUFFER_STORE: 3942 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 3943 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 3944 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 3945 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { 3946 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3947 3948 // rsrc 3949 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3950 3951 // vindex 3952 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3953 3954 // voffset 3955 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3956 3957 // soffset 3958 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3959 3960 // Any remaining operands are immediates and were correctly null 3961 // initialized. 
3962 break; 3963 } 3964 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 3965 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 3966 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 3967 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 3968 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 3969 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 3970 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 3971 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 3972 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 3973 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 3974 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 3975 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: 3976 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 3977 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 3978 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 3979 // vdata_out 3980 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3981 3982 // vdata_in 3983 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3984 3985 // rsrc 3986 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3987 3988 // vindex 3989 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3990 3991 // voffset 3992 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3993 3994 // soffset 3995 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 3996 3997 // Any remaining operands are immediates and were correctly null 3998 // initialized. 
3999 break; 4000 } 4001 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 4002 // vdata_out 4003 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4004 4005 // vdata_in 4006 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4007 4008 // cmp 4009 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4010 4011 // rsrc 4012 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4013 4014 // vindex 4015 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4016 4017 // voffset 4018 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4019 4020 // soffset 4021 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI); 4022 4023 // Any remaining operands are immediates and were correctly null 4024 // initialized. 4025 break; 4026 } 4027 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 4028 // Lie and claim everything is legal, even though some need to be 4029 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 4030 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4031 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4032 4033 // We need to convert this to a MUBUF if either the resource of offset is 4034 // VGPR. 
4035 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); 4036 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); 4037 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank); 4038 4039 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4040 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0); 4041 break; 4042 } 4043 case AMDGPU::G_INTRINSIC: { 4044 switch (MI.getIntrinsicID()) { 4045 default: 4046 return getInvalidInstructionMapping(); 4047 case Intrinsic::amdgcn_div_fmas: 4048 case Intrinsic::amdgcn_div_fixup: 4049 case Intrinsic::amdgcn_trig_preop: 4050 case Intrinsic::amdgcn_sin: 4051 case Intrinsic::amdgcn_cos: 4052 case Intrinsic::amdgcn_log_clamp: 4053 case Intrinsic::amdgcn_rcp: 4054 case Intrinsic::amdgcn_rcp_legacy: 4055 case Intrinsic::amdgcn_sqrt: 4056 case Intrinsic::amdgcn_rsq: 4057 case Intrinsic::amdgcn_rsq_legacy: 4058 case Intrinsic::amdgcn_rsq_clamp: 4059 case Intrinsic::amdgcn_fmul_legacy: 4060 case Intrinsic::amdgcn_fma_legacy: 4061 case Intrinsic::amdgcn_ldexp: 4062 case Intrinsic::amdgcn_frexp_mant: 4063 case Intrinsic::amdgcn_frexp_exp: 4064 case Intrinsic::amdgcn_fract: 4065 case Intrinsic::amdgcn_cvt_pkrtz: 4066 case Intrinsic::amdgcn_cvt_pknorm_i16: 4067 case Intrinsic::amdgcn_cvt_pknorm_u16: 4068 case Intrinsic::amdgcn_cvt_pk_i16: 4069 case Intrinsic::amdgcn_cvt_pk_u16: 4070 case Intrinsic::amdgcn_fmed3: 4071 case Intrinsic::amdgcn_cubeid: 4072 case Intrinsic::amdgcn_cubema: 4073 case Intrinsic::amdgcn_cubesc: 4074 case Intrinsic::amdgcn_cubetc: 4075 case Intrinsic::amdgcn_sffbh: 4076 case Intrinsic::amdgcn_fmad_ftz: 4077 case Intrinsic::amdgcn_mbcnt_lo: 4078 case Intrinsic::amdgcn_mbcnt_hi: 4079 case Intrinsic::amdgcn_mul_u24: 4080 case Intrinsic::amdgcn_mul_i24: 4081 case Intrinsic::amdgcn_mulhi_u24: 4082 case Intrinsic::amdgcn_mulhi_i24: 4083 case Intrinsic::amdgcn_lerp: 4084 case Intrinsic::amdgcn_sad_u8: 4085 case Intrinsic::amdgcn_msad_u8: 4086 case 
Intrinsic::amdgcn_sad_hi_u8: 4087 case Intrinsic::amdgcn_sad_u16: 4088 case Intrinsic::amdgcn_qsad_pk_u16_u8: 4089 case Intrinsic::amdgcn_mqsad_pk_u16_u8: 4090 case Intrinsic::amdgcn_mqsad_u32_u8: 4091 case Intrinsic::amdgcn_cvt_pk_u8_f32: 4092 case Intrinsic::amdgcn_alignbyte: 4093 case Intrinsic::amdgcn_perm: 4094 case Intrinsic::amdgcn_fdot2: 4095 case Intrinsic::amdgcn_sdot2: 4096 case Intrinsic::amdgcn_udot2: 4097 case Intrinsic::amdgcn_sdot4: 4098 case Intrinsic::amdgcn_udot4: 4099 case Intrinsic::amdgcn_sdot8: 4100 case Intrinsic::amdgcn_udot8: 4101 return getDefaultMappingVOP(MI); 4102 case Intrinsic::amdgcn_sbfe: 4103 case Intrinsic::amdgcn_ubfe: 4104 if (isSALUMapping(MI)) 4105 return getDefaultMappingSOP(MI); 4106 return getDefaultMappingVOP(MI); 4107 case Intrinsic::amdgcn_ds_swizzle: 4108 case Intrinsic::amdgcn_ds_permute: 4109 case Intrinsic::amdgcn_ds_bpermute: 4110 case Intrinsic::amdgcn_update_dpp: 4111 case Intrinsic::amdgcn_mov_dpp8: 4112 case Intrinsic::amdgcn_mov_dpp: 4113 case Intrinsic::amdgcn_strict_wwm: 4114 case Intrinsic::amdgcn_wwm: 4115 case Intrinsic::amdgcn_strict_wqm: 4116 case Intrinsic::amdgcn_wqm: 4117 case Intrinsic::amdgcn_softwqm: 4118 case Intrinsic::amdgcn_set_inactive: 4119 return getDefaultMappingAllVGPR(MI); 4120 case Intrinsic::amdgcn_kernarg_segment_ptr: 4121 case Intrinsic::amdgcn_s_getpc: 4122 case Intrinsic::amdgcn_groupstaticsize: 4123 case Intrinsic::amdgcn_reloc_constant: 4124 case Intrinsic::returnaddress: { 4125 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4126 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4127 break; 4128 } 4129 case Intrinsic::amdgcn_wqm_vote: { 4130 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4131 OpdsMapping[0] = OpdsMapping[2] 4132 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 4133 break; 4134 } 4135 case Intrinsic::amdgcn_ps_live: { 4136 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 
4137 break; 4138 } 4139 case Intrinsic::amdgcn_div_scale: { 4140 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4141 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4142 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 4143 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 4144 4145 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4146 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4147 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4148 break; 4149 } 4150 case Intrinsic::amdgcn_class: { 4151 Register Src0Reg = MI.getOperand(2).getReg(); 4152 Register Src1Reg = MI.getOperand(3).getReg(); 4153 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 4154 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 4155 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4156 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4157 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); 4158 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); 4159 break; 4160 } 4161 case Intrinsic::amdgcn_icmp: 4162 case Intrinsic::amdgcn_fcmp: { 4163 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4164 // This is not VCCRegBank because this is not used in boolean contexts. 4165 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4166 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4167 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4168 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4169 break; 4170 } 4171 case Intrinsic::amdgcn_readlane: { 4172 // This must be an SGPR, but accept a VGPR. 
4173 Register IdxReg = MI.getOperand(3).getReg(); 4174 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4175 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4176 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4177 LLVM_FALLTHROUGH; 4178 } 4179 case Intrinsic::amdgcn_readfirstlane: { 4180 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4181 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4182 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4183 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4184 break; 4185 } 4186 case Intrinsic::amdgcn_writelane: { 4187 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4188 Register SrcReg = MI.getOperand(2).getReg(); 4189 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4190 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID); 4191 Register IdxReg = MI.getOperand(3).getReg(); 4192 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4193 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4194 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4195 4196 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 4197 // to legalize. 
4198 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 4199 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4200 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4201 break; 4202 } 4203 case Intrinsic::amdgcn_if_break: { 4204 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4205 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4206 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4207 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4208 break; 4209 } 4210 case Intrinsic::amdgcn_permlane16: 4211 case Intrinsic::amdgcn_permlanex16: { 4212 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4213 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4214 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4215 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4216 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4217 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4218 break; 4219 } 4220 case Intrinsic::amdgcn_mfma_f32_4x4x1f32: 4221 case Intrinsic::amdgcn_mfma_f32_4x4x4f16: 4222 case Intrinsic::amdgcn_mfma_i32_4x4x4i8: 4223 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: 4224 case Intrinsic::amdgcn_mfma_f32_16x16x1f32: 4225 case Intrinsic::amdgcn_mfma_f32_16x16x4f32: 4226 case Intrinsic::amdgcn_mfma_f32_16x16x4f16: 4227 case Intrinsic::amdgcn_mfma_f32_16x16x16f16: 4228 case Intrinsic::amdgcn_mfma_i32_16x16x4i8: 4229 case Intrinsic::amdgcn_mfma_i32_16x16x16i8: 4230 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: 4231 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: 4232 case Intrinsic::amdgcn_mfma_f32_32x32x1f32: 4233 case Intrinsic::amdgcn_mfma_f32_32x32x2f32: 4234 case Intrinsic::amdgcn_mfma_f32_32x32x4f16: 4235 case Intrinsic::amdgcn_mfma_f32_32x32x8f16: 4236 case Intrinsic::amdgcn_mfma_i32_32x32x4i8: 4237 case 
Intrinsic::amdgcn_mfma_i32_32x32x8i8: 4238 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: 4239 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: 4240 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: 4241 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: 4242 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: 4243 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: 4244 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: 4245 case Intrinsic::amdgcn_mfma_f64_16x16x4f64: 4246 case Intrinsic::amdgcn_mfma_f64_4x4x4f64: { 4247 // Default for MAI intrinsics. 4248 // srcC can also be an immediate which can be folded later. 4249 // FIXME: Should we eventually add an alternative mapping with AGPR src 4250 // for srcA/srcB? 4251 // 4252 // vdst, srcA, srcB, srcC 4253 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 4254 OpdsMapping[0] = 4255 Info->mayNeedAGPRs() 4256 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) 4257 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4258 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4259 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4260 OpdsMapping[4] = 4261 Info->mayNeedAGPRs() 4262 ? 
getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) 4263 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4264 break; 4265 } 4266 case Intrinsic::amdgcn_interp_p1: 4267 case Intrinsic::amdgcn_interp_p2: 4268 case Intrinsic::amdgcn_interp_mov: 4269 case Intrinsic::amdgcn_interp_p1_f16: 4270 case Intrinsic::amdgcn_interp_p2_f16: { 4271 const int M0Idx = MI.getNumOperands() - 1; 4272 Register M0Reg = MI.getOperand(M0Idx).getReg(); 4273 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); 4274 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4275 4276 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4277 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) 4278 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4279 4280 // Must be SGPR, but we must take whatever the original bank is and fix it 4281 // later. 4282 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); 4283 break; 4284 } 4285 case Intrinsic::amdgcn_ballot: { 4286 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4287 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4288 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4289 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); 4290 break; 4291 } 4292 } 4293 break; 4294 } 4295 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 4296 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 4297 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 4298 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 4299 auto IntrID = MI.getIntrinsicID(); 4300 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); 4301 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); 4302 // Non-images can have complications from operands that allow both SGPR 4303 // and VGPR. 
For now it's too complicated to figure out the final opcode 4304 // to derive the register bank from the MCInstrDesc. 4305 assert(RSrcIntrin->IsImage); 4306 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); 4307 } 4308 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { 4309 unsigned N = MI.getNumExplicitOperands() - 2; 4310 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); 4311 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI); 4312 if (N == 3) { 4313 // Sequential form: all operands combined into VGPR256/VGPR512 4314 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4315 if (Size > 256) 4316 Size = 512; 4317 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4318 } else { 4319 // NSA form 4320 for (unsigned I = 2; I < N; ++I) 4321 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4322 } 4323 break; 4324 } 4325 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 4326 auto IntrID = MI.getIntrinsicID(); 4327 switch (IntrID) { 4328 case Intrinsic::amdgcn_s_getreg: 4329 case Intrinsic::amdgcn_s_memtime: 4330 case Intrinsic::amdgcn_s_memrealtime: 4331 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { 4332 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4333 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4334 break; 4335 } 4336 case Intrinsic::amdgcn_global_atomic_fadd: 4337 case Intrinsic::amdgcn_global_atomic_csub: 4338 case Intrinsic::amdgcn_global_atomic_fmin: 4339 case Intrinsic::amdgcn_global_atomic_fmax: 4340 case Intrinsic::amdgcn_flat_atomic_fadd: 4341 case Intrinsic::amdgcn_flat_atomic_fmin: 4342 case Intrinsic::amdgcn_flat_atomic_fmax: 4343 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: 4344 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: 4345 return getDefaultMappingAllVGPR(MI); 4346 case Intrinsic::amdgcn_ds_ordered_add: 4347 case Intrinsic::amdgcn_ds_ordered_swap: { 4348 unsigned DstSize = 
MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4349 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4350 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4351 AMDGPU::SGPRRegBankID); 4352 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 4353 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4354 break; 4355 } 4356 case Intrinsic::amdgcn_ds_append: 4357 case Intrinsic::amdgcn_ds_consume: { 4358 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4359 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4360 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4361 break; 4362 } 4363 case Intrinsic::amdgcn_exp_compr: 4364 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4365 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4366 break; 4367 case Intrinsic::amdgcn_exp: 4368 // FIXME: Could we support packed types here? 4369 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4370 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4371 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4372 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4373 break; 4374 case Intrinsic::amdgcn_s_sendmsg: 4375 case Intrinsic::amdgcn_s_sendmsghalt: { 4376 // This must be an SGPR, but accept a VGPR. 4377 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4378 AMDGPU::SGPRRegBankID); 4379 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4380 break; 4381 } 4382 case Intrinsic::amdgcn_s_setreg: { 4383 // This must be an SGPR, but accept a VGPR. 
4384 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4385 AMDGPU::SGPRRegBankID); 4386 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4387 break; 4388 } 4389 case Intrinsic::amdgcn_end_cf: { 4390 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4391 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4392 break; 4393 } 4394 case Intrinsic::amdgcn_else: { 4395 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4396 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4397 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 4398 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 4399 break; 4400 } 4401 case Intrinsic::amdgcn_live_mask: { 4402 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4403 break; 4404 } 4405 case Intrinsic::amdgcn_wqm_demote: 4406 case Intrinsic::amdgcn_kill: { 4407 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4408 break; 4409 } 4410 case Intrinsic::amdgcn_raw_buffer_load: 4411 case Intrinsic::amdgcn_raw_tbuffer_load: { 4412 // FIXME: Should make intrinsic ID the last operand of the instruction, 4413 // then this would be the same as store 4414 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4415 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4416 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4417 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4418 break; 4419 } 4420 case Intrinsic::amdgcn_raw_buffer_store: 4421 case Intrinsic::amdgcn_raw_buffer_store_format: 4422 case Intrinsic::amdgcn_raw_tbuffer_store: { 4423 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4424 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4425 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4426 OpdsMapping[4] = 
getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4427 break; 4428 } 4429 case Intrinsic::amdgcn_struct_buffer_load: 4430 case Intrinsic::amdgcn_struct_tbuffer_load: { 4431 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4432 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4433 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4434 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4435 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4436 break; 4437 } 4438 case Intrinsic::amdgcn_struct_buffer_store: 4439 case Intrinsic::amdgcn_struct_tbuffer_store: { 4440 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4441 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4442 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4443 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4444 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4445 break; 4446 } 4447 case Intrinsic::amdgcn_init_exec_from_input: { 4448 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4449 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4450 break; 4451 } 4452 case Intrinsic::amdgcn_ds_gws_init: 4453 case Intrinsic::amdgcn_ds_gws_barrier: 4454 case Intrinsic::amdgcn_ds_gws_sema_br: { 4455 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4456 4457 // This must be an SGPR, but accept a VGPR. 4458 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4459 AMDGPU::SGPRRegBankID); 4460 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4461 break; 4462 } 4463 case Intrinsic::amdgcn_ds_gws_sema_v: 4464 case Intrinsic::amdgcn_ds_gws_sema_p: 4465 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 4466 // This must be an SGPR, but accept a VGPR. 
4467 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, 4468 AMDGPU::SGPRRegBankID); 4469 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); 4470 break; 4471 } 4472 default: 4473 return getInvalidInstructionMapping(); 4474 } 4475 break; 4476 } 4477 case AMDGPU::G_SELECT: { 4478 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4479 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4480 AMDGPU::SGPRRegBankID); 4481 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, 4482 AMDGPU::SGPRRegBankID); 4483 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && 4484 Op3Bank == AMDGPU::SGPRRegBankID; 4485 4486 unsigned CondBankDefault = SGPRSrcs ? 4487 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4488 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, 4489 CondBankDefault); 4490 if (CondBank == AMDGPU::SGPRRegBankID) 4491 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4492 else if (CondBank == AMDGPU::VGPRRegBankID) 4493 CondBank = AMDGPU::VCCRegBankID; 4494 4495 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ? 4496 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4497 4498 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID); 4499 4500 // TODO: Should report 32-bit for scalar condition type. 
4501 if (Size == 64) { 4502 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4503 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 4504 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4505 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4506 } else { 4507 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); 4508 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 4509 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); 4510 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); 4511 } 4512 4513 break; 4514 } 4515 4516 case AMDGPU::G_SI_CALL: { 4517 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64); 4518 // Lie and claim everything is legal, even though some need to be 4519 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 4520 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4521 4522 // Allow anything for implicit arguments 4523 for (unsigned I = 4; I < MI.getNumOperands(); ++I) { 4524 if (MI.getOperand(I).isReg()) { 4525 Register Reg = MI.getOperand(I).getReg(); 4526 auto OpBank = getRegBankID(Reg, MRI); 4527 unsigned Size = getSizeInBits(Reg, MRI, *TRI); 4528 OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size); 4529 } 4530 } 4531 break; 4532 } 4533 case AMDGPU::G_LOAD: 4534 case AMDGPU::G_ZEXTLOAD: 4535 case AMDGPU::G_SEXTLOAD: 4536 return getInstrMappingForLoad(MI); 4537 4538 case AMDGPU::G_ATOMICRMW_XCHG: 4539 case AMDGPU::G_ATOMICRMW_ADD: 4540 case AMDGPU::G_ATOMICRMW_SUB: 4541 case AMDGPU::G_ATOMICRMW_AND: 4542 case AMDGPU::G_ATOMICRMW_OR: 4543 case AMDGPU::G_ATOMICRMW_XOR: 4544 case AMDGPU::G_ATOMICRMW_MAX: 4545 case AMDGPU::G_ATOMICRMW_MIN: 4546 case AMDGPU::G_ATOMICRMW_UMAX: 4547 case AMDGPU::G_ATOMICRMW_UMIN: 4548 case AMDGPU::G_ATOMICRMW_FADD: 4549 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 4550 case AMDGPU::G_AMDGPU_ATOMIC_INC: 4551 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 4552 case AMDGPU::G_AMDGPU_ATOMIC_FMIN: 4553 case 
AMDGPU::G_AMDGPU_ATOMIC_FMAX: { 4554 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4555 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4556 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4557 break; 4558 } 4559 case AMDGPU::G_ATOMIC_CMPXCHG: { 4560 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4561 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4562 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4563 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4564 break; 4565 } 4566 case AMDGPU::G_BRCOND: { 4567 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, 4568 AMDGPU::SGPRRegBankID); 4569 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 4570 if (Bank != AMDGPU::SGPRRegBankID) 4571 Bank = AMDGPU::VCCRegBankID; 4572 4573 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); 4574 break; 4575 } 4576 case AMDGPU::G_FPTRUNC_ROUND_UPWARD: 4577 case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD: 4578 return getDefaultMappingVOP(MI); 4579 } 4580 4581 return getInstructionMapping(/*ID*/1, /*Cost*/1, 4582 getOperandsMapping(OpdsMapping), 4583 MI.getNumOperands()); 4584 } 4585