//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank.
/// A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this was raised to 2 in gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation.
By forcing all inputs to VGPRs, it also simplifies the task of 67 /// picking the optimal operand combination from a post-isel optimization pass. 68 /// 69 //===----------------------------------------------------------------------===// 70 71 #include "AMDGPURegisterBankInfo.h" 72 73 #include "AMDGPU.h" 74 #include "AMDGPUGlobalISelUtils.h" 75 #include "AMDGPUInstrInfo.h" 76 #include "GCNSubtarget.h" 77 #include "SIMachineFunctionInfo.h" 78 #include "SIRegisterInfo.h" 79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" 80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 83 #include "llvm/CodeGen/RegisterBank.h" 84 #include "llvm/IR/IntrinsicsAMDGPU.h" 85 86 #define GET_TARGET_REGBANK_IMPL 87 #include "AMDGPUGenRegisterBank.inc" 88 89 // This file will be TableGen'ed at some point. 90 #include "AMDGPUGenRegisterBankInfo.def" 91 92 using namespace llvm; 93 using namespace MIPatternMatch; 94 95 namespace { 96 97 // Observer to apply a register bank to new registers created by LegalizerHelper. 98 class ApplyRegBankMapping final : public GISelChangeObserver { 99 private: 100 const AMDGPURegisterBankInfo &RBI; 101 MachineRegisterInfo &MRI; 102 const RegisterBank *NewBank; 103 SmallVector<MachineInstr *, 4> NewInsts; 104 105 public: 106 ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_, 107 MachineRegisterInfo &MRI_, const RegisterBank *RB) 108 : RBI(RBI_), MRI(MRI_), NewBank(RB) {} 109 110 ~ApplyRegBankMapping() { 111 for (MachineInstr *MI : NewInsts) 112 applyBank(*MI); 113 } 114 115 /// Set any registers that don't have a set register class or bank to SALU. 116 void applyBank(MachineInstr &MI) { 117 const unsigned Opc = MI.getOpcode(); 118 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT || 119 Opc == AMDGPU::G_SEXT) { 120 // LegalizerHelper wants to use the basic legalization artifacts when 121 // widening etc. 
We don't handle selection with vcc in artifact sources, 122 // so we need to use a select instead to handle these properly. 123 Register DstReg = MI.getOperand(0).getReg(); 124 Register SrcReg = MI.getOperand(1).getReg(); 125 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI); 126 if (SrcBank == &AMDGPU::VCCRegBank) { 127 const LLT S32 = LLT::scalar(32); 128 assert(MRI.getType(SrcReg) == LLT::scalar(1)); 129 assert(MRI.getType(DstReg) == S32); 130 assert(NewBank == &AMDGPU::VGPRRegBank); 131 132 // Replace the extension with a select, which really uses the boolean 133 // source. 134 MachineIRBuilder B(MI); 135 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1); 136 auto False = B.buildConstant(S32, 0); 137 B.buildSelect(DstReg, SrcReg, True, False); 138 MRI.setRegBank(True.getReg(0), *NewBank); 139 MRI.setRegBank(False.getReg(0), *NewBank); 140 MI.eraseFromParent(); 141 } 142 143 assert(!MRI.getRegClassOrRegBank(DstReg)); 144 MRI.setRegBank(DstReg, *NewBank); 145 return; 146 } 147 148 #ifndef NDEBUG 149 if (Opc == AMDGPU::G_TRUNC) { 150 Register DstReg = MI.getOperand(0).getReg(); 151 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI); 152 assert(DstBank != &AMDGPU::VCCRegBank); 153 } 154 #endif 155 156 for (MachineOperand &Op : MI.operands()) { 157 if (!Op.isReg()) 158 continue; 159 160 // We may see physical registers if building a real MI 161 Register Reg = Op.getReg(); 162 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg)) 163 continue; 164 165 const RegisterBank *RB = NewBank; 166 if (MRI.getType(Reg) == LLT::scalar(1)) { 167 assert(NewBank == &AMDGPU::VGPRRegBank && 168 "s1 operands should only be used for vector bools"); 169 assert((MI.getOpcode() != AMDGPU::G_TRUNC && 170 MI.getOpcode() != AMDGPU::G_ANYEXT) && 171 "not expecting legalization artifacts here"); 172 RB = &AMDGPU::VCCRegBank; 173 } 174 175 MRI.setRegBank(Reg, *RB); 176 } 177 } 178 179 void erasingInstr(MachineInstr &MI) override {} 180 181 
void createdInstr(MachineInstr &MI) override { 182 // At this point, the instruction was just inserted and has no operands. 183 NewInsts.push_back(&MI); 184 } 185 186 void changingInstr(MachineInstr &MI) override {} 187 void changedInstr(MachineInstr &MI) override { 188 // FIXME: In principle we should probably add the instruction to NewInsts, 189 // but the way the LegalizerHelper uses the observer, we will always see the 190 // registers we need to set the regbank on also referenced in a new 191 // instruction. 192 } 193 }; 194 195 } 196 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) 197 : Subtarget(ST), TRI(Subtarget.getRegisterInfo()), 198 TII(Subtarget.getInstrInfo()) { 199 200 // HACK: Until this is fully tablegen'd. 201 static llvm::once_flag InitializeRegisterBankFlag; 202 203 static auto InitializeRegisterBankOnce = [this]() { 204 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && 205 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && 206 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank); 207 (void)this; 208 }; 209 210 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce); 211 } 212 213 static bool isVectorRegisterBank(const RegisterBank &Bank) { 214 unsigned BankID = Bank.getID(); 215 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; 216 } 217 218 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, 219 const RegisterBank &Src, 220 unsigned Size) const { 221 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? 222 if (Dst.getID() == AMDGPU::SGPRRegBankID && 223 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) { 224 return std::numeric_limits<unsigned>::max(); 225 } 226 227 // Bool values are tricky, because the meaning is based on context. The SCC 228 // and VCC banks are for the natural scalar and vector conditions produced by 229 // a compare. 
230 // 231 // Legalization doesn't know about the necessary context, so an s1 use may 232 // have been a truncate from an arbitrary value, in which case a copy (lowered 233 // as a compare with 0) needs to be inserted. 234 if (Size == 1 && 235 (Dst.getID() == AMDGPU::SGPRRegBankID) && 236 (isVectorRegisterBank(Src) || 237 Src.getID() == AMDGPU::SGPRRegBankID || 238 Src.getID() == AMDGPU::VCCRegBankID)) 239 return std::numeric_limits<unsigned>::max(); 240 241 // There is no direct copy between AGPRs. 242 if (Dst.getID() == AMDGPU::AGPRRegBankID && 243 Src.getID() == AMDGPU::AGPRRegBankID) 244 return 4; 245 246 return RegisterBankInfo::copyCost(Dst, Src, Size); 247 } 248 249 unsigned AMDGPURegisterBankInfo::getBreakDownCost( 250 const ValueMapping &ValMapping, 251 const RegisterBank *CurBank) const { 252 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to 253 // VGPR. 254 // FIXME: Is there a better way to do this? 255 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) 256 return 10; // This is expensive. 257 258 assert(ValMapping.NumBreakDowns == 2 && 259 ValMapping.BreakDown[0].Length == 32 && 260 ValMapping.BreakDown[0].StartIdx == 0 && 261 ValMapping.BreakDown[1].Length == 32 && 262 ValMapping.BreakDown[1].StartIdx == 32 && 263 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank); 264 265 // 32-bit extract of a 64-bit value is just access of a subregister, so free. 266 // TODO: Cost of 0 hits assert, though it's not clear it's what we really 267 // want. 268 269 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR 270 // alignment restrictions, but this probably isn't important. 271 return 1; 272 } 273 274 const RegisterBank & 275 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, 276 LLT Ty) const { 277 if (&RC == &AMDGPU::SReg_1RegClass) 278 return AMDGPU::VCCRegBank; 279 280 // We promote real scalar booleans to SReg_32. 
Any SGPR using s1 is really a 281 // VCC-like use. 282 if (TRI->isSGPRClass(&RC)) { 283 // FIXME: This probably came from a copy from a physical register, which 284 // should be inferable from the copied to-type. We don't have many boolean 285 // physical register constraints so just assume a normal SGPR for now. 286 if (!Ty.isValid()) 287 return AMDGPU::SGPRRegBank; 288 289 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; 290 } 291 292 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank; 293 } 294 295 template <unsigned NumOps> 296 RegisterBankInfo::InstructionMappings 297 AMDGPURegisterBankInfo::addMappingFromTable( 298 const MachineInstr &MI, const MachineRegisterInfo &MRI, 299 const std::array<unsigned, NumOps> RegSrcOpIdx, 300 ArrayRef<OpRegBankEntry<NumOps>> Table) const { 301 302 InstructionMappings AltMappings; 303 304 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); 305 306 unsigned Sizes[NumOps]; 307 for (unsigned I = 0; I < NumOps; ++I) { 308 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); 309 Sizes[I] = getSizeInBits(Reg, MRI, *TRI); 310 } 311 312 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { 313 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); 314 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); 315 } 316 317 // getInstrMapping's default mapping uses ID 1, so start at 2. 
318 unsigned MappingID = 2; 319 for (const auto &Entry : Table) { 320 for (unsigned I = 0; I < NumOps; ++I) { 321 int OpIdx = RegSrcOpIdx[I]; 322 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]); 323 } 324 325 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost, 326 getOperandsMapping(Operands), 327 Operands.size())); 328 } 329 330 return AltMappings; 331 } 332 333 RegisterBankInfo::InstructionMappings 334 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( 335 const MachineInstr &MI, const MachineRegisterInfo &MRI) const { 336 switch (MI.getIntrinsicID()) { 337 case Intrinsic::amdgcn_readlane: { 338 static const OpRegBankEntry<3> Table[2] = { 339 // Perfectly legal. 340 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 341 342 // Need a readfirstlane for the index. 343 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } 344 }; 345 346 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; 347 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 348 } 349 case Intrinsic::amdgcn_writelane: { 350 static const OpRegBankEntry<4> Table[4] = { 351 // Perfectly legal. 
352 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 353 354 // Need readfirstlane of first op 355 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, 356 357 // Need readfirstlane of second op 358 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, 359 360 // Need readfirstlane of both ops 361 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } 362 }; 363 364 // rsrc, voffset, offset 365 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } }; 366 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 367 } 368 default: 369 return RegisterBankInfo::getInstrAlternativeMappings(MI); 370 } 371 } 372 373 RegisterBankInfo::InstructionMappings 374 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( 375 const MachineInstr &MI, const MachineRegisterInfo &MRI) const { 376 377 switch (MI.getIntrinsicID()) { 378 case Intrinsic::amdgcn_s_buffer_load: { 379 static const OpRegBankEntry<2> Table[4] = { 380 // Perfectly legal. 381 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 382 383 // Only need 1 register in loop 384 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }, 385 386 // Have to waterfall the resource. 387 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, 388 389 // Have to waterfall the resource, and the offset. 390 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 } 391 }; 392 393 // rsrc, offset 394 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } }; 395 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 396 } 397 case Intrinsic::amdgcn_ds_ordered_add: 398 case Intrinsic::amdgcn_ds_ordered_swap: { 399 // VGPR = M0, VGPR 400 static const OpRegBankEntry<3> Table[2] = { 401 // Perfectly legal. 
402 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 403 404 // Need a readfirstlane for m0 405 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } 406 }; 407 408 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; 409 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 410 } 411 case Intrinsic::amdgcn_s_sendmsg: 412 case Intrinsic::amdgcn_s_sendmsghalt: { 413 // FIXME: Should have no register for immediate 414 static const OpRegBankEntry<1> Table[2] = { 415 // Perfectly legal. 416 { { AMDGPU::SGPRRegBankID }, 1 }, 417 418 // Need readlane 419 { { AMDGPU::VGPRRegBankID }, 3 } 420 }; 421 422 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } }; 423 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 424 } 425 default: 426 return RegisterBankInfo::getInstrAlternativeMappings(MI); 427 } 428 } 429 430 // FIXME: Returns uniform if there's no source value information. This is 431 // probably wrong. 432 static bool isScalarLoadLegal(const MachineInstr &MI) { 433 if (!MI.hasOneMemOperand()) 434 return false; 435 436 const MachineMemOperand *MMO = *MI.memoperands_begin(); 437 const unsigned AS = MMO->getAddrSpace(); 438 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || 439 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; 440 // Require 4-byte alignment. 441 return MMO->getAlign() >= Align(4) && 442 // Can't do a scalar atomic load. 443 !MMO->isAtomic() && 444 // Don't use scalar loads for volatile accesses to non-constant address 445 // spaces. 446 (IsConst || !MMO->isVolatile()) && 447 // Memory must be known constant, or not written before this load. 
448 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) && 449 AMDGPUInstrInfo::isUniformMMO(MMO); 450 } 451 452 RegisterBankInfo::InstructionMappings 453 AMDGPURegisterBankInfo::getInstrAlternativeMappings( 454 const MachineInstr &MI) const { 455 456 const MachineFunction &MF = *MI.getParent()->getParent(); 457 const MachineRegisterInfo &MRI = MF.getRegInfo(); 458 459 460 InstructionMappings AltMappings; 461 switch (MI.getOpcode()) { 462 case TargetOpcode::G_CONSTANT: { 463 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 464 if (Size == 1) { 465 static const OpRegBankEntry<1> Table[3] = { 466 { { AMDGPU::VGPRRegBankID }, 1 }, 467 { { AMDGPU::SGPRRegBankID }, 1 }, 468 { { AMDGPU::VCCRegBankID }, 1 } 469 }; 470 471 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); 472 } 473 474 LLVM_FALLTHROUGH; 475 } 476 case TargetOpcode::G_FCONSTANT: 477 case TargetOpcode::G_FRAME_INDEX: 478 case TargetOpcode::G_GLOBAL_VALUE: { 479 static const OpRegBankEntry<1> Table[2] = { 480 { { AMDGPU::VGPRRegBankID }, 1 }, 481 { { AMDGPU::SGPRRegBankID }, 1 } 482 }; 483 484 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); 485 } 486 case TargetOpcode::G_AND: 487 case TargetOpcode::G_OR: 488 case TargetOpcode::G_XOR: { 489 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 490 491 if (Size == 1) { 492 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. 
493 const InstructionMapping &SCCMapping = getInstructionMapping( 494 1, 1, getOperandsMapping( 495 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), 496 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), 497 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}), 498 3); // Num Operands 499 AltMappings.push_back(&SCCMapping); 500 501 const InstructionMapping &VCCMapping0 = getInstructionMapping( 502 2, 1, getOperandsMapping( 503 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), 504 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), 505 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}), 506 3); // Num Operands 507 AltMappings.push_back(&VCCMapping0); 508 return AltMappings; 509 } 510 511 if (Size != 64) 512 break; 513 514 const InstructionMapping &SSMapping = getInstructionMapping( 515 1, 1, getOperandsMapping( 516 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 517 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 518 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 519 3); // Num Operands 520 AltMappings.push_back(&SSMapping); 521 522 const InstructionMapping &VVMapping = getInstructionMapping( 523 2, 2, getOperandsMapping( 524 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 525 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 526 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 527 3); // Num Operands 528 AltMappings.push_back(&VVMapping); 529 break; 530 } 531 case TargetOpcode::G_LOAD: 532 case TargetOpcode::G_ZEXTLOAD: 533 case TargetOpcode::G_SEXTLOAD: { 534 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 535 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 536 unsigned PtrSize = PtrTy.getSizeInBits(); 537 unsigned AS = PtrTy.getAddressSpace(); 538 539 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && 540 AS != AMDGPUAS::PRIVATE_ADDRESS) && 541 isScalarLoadLegal(MI)) { 542 const InstructionMapping &SSMapping = getInstructionMapping( 543 
1, 1, getOperandsMapping( 544 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 545 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}), 546 2); // Num Operands 547 AltMappings.push_back(&SSMapping); 548 } 549 550 const InstructionMapping &VVMapping = getInstructionMapping( 551 2, 1, 552 getOperandsMapping( 553 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 554 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), 555 2); // Num Operands 556 AltMappings.push_back(&VVMapping); 557 558 // It may be possible to have a vgpr = load sgpr mapping here, because 559 // the mubuf instructions support this kind of load, but probably for only 560 // gfx7 and older. However, the addressing mode matching in the instruction 561 // selector should be able to do a better job of detecting and selecting 562 // these kinds of loads from the vgpr = load vgpr mapping. 563 564 return AltMappings; 565 566 } 567 case TargetOpcode::G_SELECT: { 568 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 569 const InstructionMapping &SSMapping = getInstructionMapping(1, 1, 570 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 571 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), 572 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 573 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 574 4); // Num Operands 575 AltMappings.push_back(&SSMapping); 576 577 const InstructionMapping &VVMapping = getInstructionMapping(2, 1, 578 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 579 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), 580 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 581 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 582 4); // Num Operands 583 AltMappings.push_back(&VVMapping); 584 585 return AltMappings; 586 } 587 case TargetOpcode::G_UADDE: 588 case TargetOpcode::G_USUBE: 589 case TargetOpcode::G_SADDE: 590 case TargetOpcode::G_SSUBE: { 591 
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 592 const InstructionMapping &SSMapping = getInstructionMapping(1, 1, 593 getOperandsMapping( 594 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 595 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), 596 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 597 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 598 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}), 599 5); // Num Operands 600 AltMappings.push_back(&SSMapping); 601 602 const InstructionMapping &VVMapping = getInstructionMapping(2, 1, 603 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 604 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), 605 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 606 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 607 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}), 608 5); // Num Operands 609 AltMappings.push_back(&VVMapping); 610 return AltMappings; 611 } 612 case AMDGPU::G_BRCOND: { 613 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 614 615 // TODO: Change type to 32 for scalar 616 const InstructionMapping &SMapping = getInstructionMapping( 617 1, 1, getOperandsMapping( 618 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}), 619 2); // Num Operands 620 AltMappings.push_back(&SMapping); 621 622 const InstructionMapping &VMapping = getInstructionMapping( 623 1, 1, getOperandsMapping( 624 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }), 625 2); // Num Operands 626 AltMappings.push_back(&VMapping); 627 return AltMappings; 628 } 629 case AMDGPU::G_INTRINSIC: 630 return getInstrAlternativeMappingsIntrinsic(MI, MRI); 631 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: 632 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); 633 default: 634 break; 635 } 636 return RegisterBankInfo::getInstrAlternativeMappings(MI); 637 } 638 639 void AMDGPURegisterBankInfo::split64BitValueForMapping( 640 MachineIRBuilder 
&B, 641 SmallVector<Register, 2> &Regs, 642 LLT HalfTy, 643 Register Reg) const { 644 assert(HalfTy.getSizeInBits() == 32); 645 MachineRegisterInfo *MRI = B.getMRI(); 646 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy); 647 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy); 648 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI); 649 MRI->setRegBank(LoLHS, *Bank); 650 MRI->setRegBank(HiLHS, *Bank); 651 652 Regs.push_back(LoLHS); 653 Regs.push_back(HiLHS); 654 655 B.buildInstr(AMDGPU::G_UNMERGE_VALUES) 656 .addDef(LoLHS) 657 .addDef(HiLHS) 658 .addUse(Reg); 659 } 660 661 /// Replace the current type each register in \p Regs has with \p NewTy 662 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, 663 LLT NewTy) { 664 for (Register Reg : Regs) { 665 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()); 666 MRI.setType(Reg, NewTy); 667 } 668 } 669 670 static LLT getHalfSizedType(LLT Ty) { 671 if (Ty.isVector()) { 672 assert(Ty.getElementCount().isKnownMultipleOf(2)); 673 return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2), 674 Ty.getElementType()); 675 } 676 677 assert(Ty.getScalarSizeInBits() % 2 == 0); 678 return LLT::scalar(Ty.getScalarSizeInBits() / 2); 679 } 680 681 // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector 682 // source value into a scalar register. 
683 Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B, 684 MachineRegisterInfo &MRI, 685 Register Src) const { 686 LLT Ty = MRI.getType(Src); 687 const RegisterBank *Bank = getRegBank(Src, MRI, *TRI); 688 689 if (Bank == &AMDGPU::SGPRRegBank) 690 return Src; 691 692 unsigned Bits = Ty.getSizeInBits(); 693 assert(Bits % 32 == 0); 694 695 if (Bank != &AMDGPU::VGPRRegBank) { 696 // We need to copy from AGPR to VGPR 697 Src = B.buildCopy(Ty, Src).getReg(0); 698 MRI.setRegBank(Src, AMDGPU::VGPRRegBank); 699 } 700 701 LLT S32 = LLT::scalar(32); 702 unsigned NumParts = Bits / 32; 703 SmallVector<Register, 8> SrcParts; 704 SmallVector<Register, 8> DstParts; 705 706 if (Bits == 32) { 707 SrcParts.push_back(Src); 708 } else { 709 auto Unmerge = B.buildUnmerge(S32, Src); 710 for (unsigned i = 0; i < NumParts; ++i) 711 SrcParts.push_back(Unmerge.getReg(i)); 712 } 713 714 for (unsigned i = 0; i < NumParts; ++i) { 715 Register SrcPart = SrcParts[i]; 716 Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 717 MRI.setType(DstPart, NumParts == 1 ? Ty : S32); 718 719 const TargetRegisterClass *Constrained = 720 constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI); 721 (void)Constrained; 722 assert(Constrained && "Failed to constrain readfirstlane src reg"); 723 724 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart}); 725 726 DstParts.push_back(DstPart); 727 } 728 729 if (Bits == 32) 730 return DstParts[0]; 731 732 Register Dst = B.buildMerge(Ty, DstParts).getReg(0); 733 MRI.setRegBank(Dst, AMDGPU::SGPRRegBank); 734 return Dst; 735 } 736 737 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If 738 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to 739 /// execute the instruction for each unique combination of values in all lanes 740 /// in the wave. The block will be split such that rest of the instructions are 741 /// moved to a new block. 
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity to try for compare values to identify the
/// unique values used.
///
/// \param B Builder whose current block (B.getMBB()) contains \p Range; that
/// block is split around the range. On return the insert point is the start of
/// the block holding everything that followed \p Range.
/// \param Range The instructions to move into the loop body.
/// \param SGPROperandRegs Registers whose uses inside \p Range are rewritten
/// to the per-iteration readfirstlane value.
/// \param MRI Register info used to create registers and assign banks.
/// \returns true (this overload unconditionally builds the loop).
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  // Pick the 32-bit or 64-bit flavor of every exec-mask opcode/register
  // according to the wave size of the subtarget.
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned MovExecOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const unsigned MovExecTermOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  // Only needed for the sanity check after the range has been spliced.
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  // Layout order after MBB: LoopBB, BodyBB, RestoreExecBB, RemainderBB.
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  // Resulting CFG:
  //   MBB -> LoopBB -> BodyBB -> { LoopBB, RestoreExecBB } -> RemainderBB
  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // PhiExec takes the undefined initial value when entering from MBB and
  // NewExec when coming around the back edge from BodyBB.
  B.buildInstr(TargetOpcode::PHI)
      .addDef(PhiExec)
      .addReg(InitSaveExecReg)
      .addMBB(&MBB)
      .addReg(NewExec)
      .addMBB(BodyBB);

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = BodyBB->end();

  B.setMBB(*LoopBB);

  LLT S1 = LLT::scalar(1);
  // Accumulated "every waterfalled operand matches the current lane's value"
  // condition; stays null until the first comparison is emitted.
  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in the
      // sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setMBB(*LoopBB);
      }

      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);

      // Build the comparison(s).
      // The operand is compared in 64-bit pieces when its size is a multiple
      // of 64 bits, and in 32-bit pieces otherwise.
      unsigned OpSize = OpTy.getSizeInBits();
      bool Is64 = OpSize % 64 == 0;
      unsigned PartSize = Is64 ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

        // AND this part's comparison into the running condition.
        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
        }
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    }
  }

  // The ballot becomes a no-op during instruction selection.
  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
                             false)
                .addReg(CondReg)
                .getReg(0);
  MRI.setRegClass(CondReg, WaveRC);

  // Update EXEC, saving the original EXEC value into NewExec.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Everything from here on is appended after the moved instructions.
  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    // Anything that is not already in the SGPR bank has to be waterfalled.
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

// Wrap the single instruction \p MI in a waterfall loop for the operands at
// \p OpIndices, using the caller's builder \p B. Returns false without
// changing anything when none of those operands need to be waterfalled.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  // The range covers exactly MI.
  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

// Convenience overload that creates its own builder positioned at \p MI.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
// Does nothing if operand \p OpIdx of \p MI is already in the SGPR bank.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  MachineIRBuilder B(MI);

  // Replace the operand with the readfirstlane result built before MI.
  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
1024 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { 1025 unsigned TotalSize = Ty.getSizeInBits(); 1026 if (!Ty.isVector()) 1027 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)}; 1028 1029 LLT EltTy = Ty.getElementType(); 1030 unsigned EltSize = EltTy.getSizeInBits(); 1031 assert(FirstSize % EltSize == 0); 1032 1033 unsigned FirstPartNumElts = FirstSize / EltSize; 1034 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; 1035 1036 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy), 1037 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)}; 1038 } 1039 1040 static LLT widen96To128(LLT Ty) { 1041 if (!Ty.isVector()) 1042 return LLT::scalar(128); 1043 1044 LLT EltTy = Ty.getElementType(); 1045 assert(128 % EltTy.getSizeInBits() == 0); 1046 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy); 1047 } 1048 1049 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, 1050 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1051 MachineRegisterInfo &MRI) const { 1052 Register DstReg = MI.getOperand(0).getReg(); 1053 const LLT LoadTy = MRI.getType(DstReg); 1054 unsigned LoadSize = LoadTy.getSizeInBits(); 1055 const unsigned MaxNonSmrdLoadSize = 128; 1056 1057 const RegisterBank *DstBank = 1058 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1059 if (DstBank == &AMDGPU::SGPRRegBank) { 1060 // There are some special cases that we need to look at for 32 bit and 96 1061 // bit SGPR loads otherwise we have nothing to do. 1062 if (LoadSize != 32 && LoadSize != 96) 1063 return false; 1064 1065 MachineMemOperand *MMO = *MI.memoperands_begin(); 1066 const unsigned MemSize = 8 * MMO->getSize(); 1067 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to 1068 // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit 1069 // scalar loads should have a load size of 32 but memory access size of less 1070 // than 32. 
1071 if (LoadSize == 32 && 1072 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI))) 1073 return false; 1074 1075 Register PtrReg = MI.getOperand(1).getReg(); 1076 1077 ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank); 1078 MachineIRBuilder B(MI, O); 1079 1080 if (LoadSize == 32) { 1081 // This is an extending load from a sub-dword size. Widen the memory 1082 // access size to 4 bytes and clear the extra high bits appropriately 1083 const LLT S32 = LLT::scalar(32); 1084 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) { 1085 // Must extend the sign bit into higher bits for a G_SEXTLOAD 1086 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); 1087 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize); 1088 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) { 1089 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD 1090 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); 1091 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize); 1092 } else 1093 // We do not need to touch the higher bits for regular loads. 1094 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0); 1095 } else { 1096 // 96-bit loads are only available for vector loads. We need to split this 1097 // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). 
1098 if (MMO->getAlign() < Align(16)) { 1099 MachineFunction *MF = MI.getParent()->getParent(); 1100 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 1101 MachineIRBuilder B(MI, ApplyBank); 1102 LegalizerHelper Helper(*MF, ApplyBank, B); 1103 LLT Part64, Part32; 1104 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); 1105 if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) != 1106 LegalizerHelper::Legalized) 1107 return false; 1108 return true; 1109 } else { 1110 LLT WiderTy = widen96To128(LoadTy); 1111 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); 1112 if (WiderTy.isScalar()) 1113 B.buildTrunc(MI.getOperand(0), WideLoad); 1114 else { 1115 B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(), 1116 WideLoad); 1117 } 1118 } 1119 } 1120 1121 MI.eraseFromParent(); 1122 return true; 1123 } 1124 1125 // 128-bit loads are supported for all instruction types. 1126 if (LoadSize <= MaxNonSmrdLoadSize) 1127 return false; 1128 1129 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0)); 1130 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1)); 1131 1132 if (SrcRegs.empty()) 1133 SrcRegs.push_back(MI.getOperand(1).getReg()); 1134 1135 assert(LoadSize % MaxNonSmrdLoadSize == 0); 1136 1137 // RegBankSelect only emits scalar types, so we need to reset the pointer 1138 // operand to a pointer type. 
1139 Register BasePtrReg = SrcRegs[0]; 1140 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 1141 MRI.setType(BasePtrReg, PtrTy); 1142 1143 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; 1144 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts); 1145 ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank); 1146 MachineIRBuilder B(MI, Observer); 1147 LegalizerHelper Helper(B.getMF(), Observer, B); 1148 1149 if (LoadTy.isVector()) { 1150 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1151 return false; 1152 } else { 1153 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1154 return false; 1155 } 1156 1157 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 1158 return true; 1159 } 1160 1161 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( 1162 MachineInstr &MI, 1163 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1164 MachineRegisterInfo &MRI) const { 1165 const MachineFunction &MF = *MI.getMF(); 1166 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1167 const auto &TFI = *ST.getFrameLowering(); 1168 1169 // Guard in case the stack growth direction ever changes with scratch 1170 // instructions. 1171 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown) 1172 return false; 1173 1174 Register Dst = MI.getOperand(0).getReg(); 1175 Register AllocSize = MI.getOperand(1).getReg(); 1176 Align Alignment = assumeAligned(MI.getOperand(2).getImm()); 1177 1178 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI); 1179 1180 // TODO: Need to emit a wave reduction to get the maximum size. 
1181 if (SizeBank != &AMDGPU::SGPRRegBank) 1182 return false; 1183 1184 LLT PtrTy = MRI.getType(Dst); 1185 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); 1186 1187 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1188 Register SPReg = Info->getStackPtrOffsetReg(); 1189 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); 1190 MachineIRBuilder B(MI, ApplyBank); 1191 1192 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2()); 1193 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize); 1194 1195 auto SPCopy = B.buildCopy(PtrTy, SPReg); 1196 if (Alignment > TFI.getStackAlign()) { 1197 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize); 1198 B.buildMaskLowPtrBits(Dst, PtrAdd, 1199 Log2(Alignment) + ST.getWavefrontSizeLog2()); 1200 } else { 1201 B.buildPtrAdd(Dst, SPCopy, ScaledSize); 1202 } 1203 1204 MI.eraseFromParent(); 1205 return true; 1206 } 1207 1208 bool AMDGPURegisterBankInfo::applyMappingImage( 1209 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1210 MachineRegisterInfo &MRI, int RsrcIdx) const { 1211 const int NumDefs = MI.getNumExplicitDefs(); 1212 1213 // The reported argument index is relative to the IR intrinsic call arguments, 1214 // so we need to shift by the number of defs and the intrinsic ID. 1215 RsrcIdx += NumDefs + 1; 1216 1217 // Insert copies to VGPR arguments. 1218 applyDefaultMapping(OpdMapper); 1219 1220 // Fixup any SGPR arguments. 1221 SmallVector<unsigned, 4> SGPRIndexes; 1222 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { 1223 if (!MI.getOperand(I).isReg()) 1224 continue; 1225 1226 // If this intrinsic has a sampler, it immediately follows rsrc. 
1227 if (I == RsrcIdx || I == RsrcIdx + 1) 1228 SGPRIndexes.push_back(I); 1229 } 1230 1231 executeInWaterfallLoop(MI, MRI, SGPRIndexes); 1232 return true; 1233 } 1234 1235 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI, 1236 Register Reg) { 1237 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); 1238 if (!Def) 1239 return Reg; 1240 1241 // TODO: Guard against this being an implicit def 1242 return Def->getOperand(0).getReg(); 1243 } 1244 1245 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store 1246 // the three offsets (voffset, soffset and instoffset) 1247 static unsigned setBufferOffsets(MachineIRBuilder &B, 1248 const AMDGPURegisterBankInfo &RBI, 1249 Register CombinedOffset, Register &VOffsetReg, 1250 Register &SOffsetReg, int64_t &InstOffsetVal, 1251 Align Alignment) { 1252 const LLT S32 = LLT::scalar(32); 1253 MachineRegisterInfo *MRI = B.getMRI(); 1254 1255 if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) { 1256 uint32_t SOffset, ImmOffset; 1257 if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget, 1258 Alignment)) { 1259 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1260 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1261 InstOffsetVal = ImmOffset; 1262 1263 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1264 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1265 return SOffset + ImmOffset; 1266 } 1267 } 1268 1269 Register Base; 1270 unsigned Offset; 1271 1272 std::tie(Base, Offset) = 1273 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset); 1274 1275 uint32_t SOffset, ImmOffset; 1276 if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, 1277 &RBI.Subtarget, Alignment)) { 1278 if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { 1279 VOffsetReg = Base; 1280 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1281 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1282 InstOffsetVal = 
ImmOffset; 1283 return 0; // XXX - Why is this 0? 1284 } 1285 1286 // If we have SGPR base, we can use it for soffset. 1287 if (SOffset == 0) { 1288 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1289 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1290 SOffsetReg = Base; 1291 InstOffsetVal = ImmOffset; 1292 return 0; // XXX - Why is this 0? 1293 } 1294 } 1295 1296 // Handle the variable sgpr + vgpr case. 1297 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI); 1298 if (Add && (int)Offset >= 0) { 1299 Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg()); 1300 Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg()); 1301 1302 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI); 1303 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI); 1304 1305 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { 1306 VOffsetReg = Src0; 1307 SOffsetReg = Src1; 1308 return 0; 1309 } 1310 1311 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) { 1312 VOffsetReg = Src1; 1313 SOffsetReg = Src0; 1314 return 0; 1315 } 1316 } 1317 1318 // Ensure we have a VGPR for the combined offset. This could be an issue if we 1319 // have an SGPR offset and a VGPR resource. 
1320 if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { 1321 VOffsetReg = CombinedOffset; 1322 } else { 1323 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0); 1324 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1325 } 1326 1327 SOffsetReg = B.buildConstant(S32, 0).getReg(0); 1328 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1329 return 0; 1330 } 1331 1332 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( 1333 const OperandsMapper &OpdMapper) const { 1334 MachineInstr &MI = OpdMapper.getMI(); 1335 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1336 1337 const LLT S32 = LLT::scalar(32); 1338 Register Dst = MI.getOperand(0).getReg(); 1339 LLT Ty = MRI.getType(Dst); 1340 1341 const RegisterBank *RSrcBank = 1342 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1343 const RegisterBank *OffsetBank = 1344 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1345 if (RSrcBank == &AMDGPU::SGPRRegBank && 1346 OffsetBank == &AMDGPU::SGPRRegBank) 1347 return true; // Legal mapping 1348 1349 // FIXME: 96-bit case was widened during legalize. We need to narrow it back 1350 // here but don't have an MMO. 1351 1352 unsigned LoadSize = Ty.getSizeInBits(); 1353 int NumLoads = 1; 1354 if (LoadSize == 256 || LoadSize == 512) { 1355 NumLoads = LoadSize / 128; 1356 Ty = Ty.divide(NumLoads); 1357 } 1358 1359 // Use the alignment to ensure that the required offsets will fit into the 1360 // immediate offsets. 1361 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1); 1362 1363 MachineIRBuilder B(MI); 1364 MachineFunction &MF = B.getMF(); 1365 1366 Register SOffset; 1367 Register VOffset; 1368 int64_t ImmOffset = 0; 1369 1370 unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(), 1371 VOffset, SOffset, ImmOffset, Alignment); 1372 1373 // TODO: 96-bit loads were widened to 128-bit results. 
Shrink the result if we 1374 // can, but we need to track an MMO for that. 1375 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; 1376 const Align MemAlign(4); // FIXME: ABI type alignment? 1377 MachineMemOperand *BaseMMO = MF.getMachineMemOperand( 1378 MachinePointerInfo(), 1379 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1380 MachineMemOperand::MOInvariant, 1381 MemSize, MemAlign); 1382 if (MMOOffset != 0) 1383 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize); 1384 1385 // If only the offset is divergent, emit a MUBUF buffer load instead. We can 1386 // assume that the buffer is unswizzled. 1387 1388 Register RSrc = MI.getOperand(1).getReg(); 1389 Register VIndex = B.buildConstant(S32, 0).getReg(0); 1390 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank); 1391 1392 SmallVector<Register, 4> LoadParts(NumLoads); 1393 1394 MachineBasicBlock::iterator MII = MI.getIterator(); 1395 MachineInstrSpan Span(MII, &B.getMBB()); 1396 1397 for (int i = 0; i < NumLoads; ++i) { 1398 if (NumLoads == 1) { 1399 LoadParts[i] = Dst; 1400 } else { 1401 LoadParts[i] = MRI.createGenericVirtualRegister(Ty); 1402 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank); 1403 } 1404 1405 MachineMemOperand *MMO = BaseMMO; 1406 if (i != 0) 1407 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize); 1408 1409 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD) 1410 .addDef(LoadParts[i]) // vdata 1411 .addUse(RSrc) // rsrc 1412 .addUse(VIndex) // vindex 1413 .addUse(VOffset) // voffset 1414 .addUse(SOffset) // soffset 1415 .addImm(ImmOffset + 16 * i) // offset(imm) 1416 .addImm(0) // cachepolicy, swizzled buffer(imm) 1417 .addImm(0) // idxen(imm) 1418 .addMemOperand(MMO); 1419 } 1420 1421 // TODO: If only the resource is a VGPR, it may be better to execute the 1422 // scalar load in the waterfall loop if the resource is expected to frequently 1423 // be dynamically uniform. 
1424 if (RSrcBank != &AMDGPU::SGPRRegBank) { 1425 // Remove the original instruction to avoid potentially confusing the 1426 // waterfall loop logic. 1427 B.setInstr(*Span.begin()); 1428 MI.eraseFromParent(); 1429 1430 SmallSet<Register, 4> OpsToWaterfall; 1431 1432 OpsToWaterfall.insert(RSrc); 1433 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 1434 OpsToWaterfall, MRI); 1435 } 1436 1437 if (NumLoads != 1) { 1438 if (Ty.isVector()) 1439 B.buildConcatVectors(Dst, LoadParts); 1440 else 1441 B.buildMerge(Dst, LoadParts); 1442 } 1443 1444 // We removed the instruction earlier with a waterfall loop. 1445 if (RSrcBank == &AMDGPU::SGPRRegBank) 1446 MI.eraseFromParent(); 1447 1448 return true; 1449 } 1450 1451 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper, 1452 bool Signed) const { 1453 MachineInstr &MI = OpdMapper.getMI(); 1454 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1455 1456 // Insert basic copies 1457 applyDefaultMapping(OpdMapper); 1458 1459 Register DstReg = MI.getOperand(0).getReg(); 1460 LLT Ty = MRI.getType(DstReg); 1461 1462 const LLT S32 = LLT::scalar(32); 1463 1464 unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1; 1465 Register SrcReg = MI.getOperand(FirstOpnd).getReg(); 1466 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg(); 1467 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg(); 1468 1469 const RegisterBank *DstBank = 1470 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1471 if (DstBank == &AMDGPU::VGPRRegBank) { 1472 if (Ty == S32) 1473 return true; 1474 1475 // There is no 64-bit vgpr bitfield extract instructions so the operation 1476 // is expanded to a sequence of instructions that implement the operation. 1477 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank); 1478 MachineIRBuilder B(MI, ApplyBank); 1479 1480 const LLT S64 = LLT::scalar(64); 1481 // Shift the source operand so that extracted bits start at bit 0. 
1482 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg) 1483 : B.buildLShr(S64, SrcReg, OffsetReg); 1484 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset); 1485 1486 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions 1487 // if the width is a constant. 1488 if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) { 1489 // Use the 32-bit bitfield extract instruction if the width is a constant. 1490 // Depending on the width size, use either the low or high 32-bits. 1491 auto Zero = B.buildConstant(S32, 0); 1492 auto WidthImm = ConstWidth->Value.getZExtValue(); 1493 if (WidthImm <= 32) { 1494 // Use bitfield extract on the lower 32-bit source, and then sign-extend 1495 // or clear the upper 32-bits. 1496 auto Extract = 1497 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg) 1498 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg); 1499 auto Extend = 1500 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero; 1501 B.buildMerge(DstReg, {Extract, Extend}); 1502 } else { 1503 // Use bitfield extract on upper 32-bit source, and combine with lower 1504 // 32-bit source. 1505 auto UpperWidth = B.buildConstant(S32, WidthImm - 32); 1506 auto Extract = 1507 Signed 1508 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth) 1509 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth); 1510 B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract}); 1511 } 1512 MI.eraseFromParent(); 1513 return true; 1514 } 1515 1516 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit 1517 // operations. 
1518 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg); 1519 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift); 1520 if (Signed) 1521 B.buildAShr(S64, SignBit, ExtShift); 1522 else 1523 B.buildLShr(S64, SignBit, ExtShift); 1524 MI.eraseFromParent(); 1525 return true; 1526 } 1527 1528 // The scalar form packs the offset and width in a single operand. 1529 1530 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); 1531 MachineIRBuilder B(MI, ApplyBank); 1532 1533 // Ensure the high bits are clear to insert the offset. 1534 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6)); 1535 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); 1536 1537 // Zeros out the low bits, so don't bother clamping the input value. 1538 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16)); 1539 1540 // Transformation function, pack the offset and width of a BFE into 1541 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second 1542 // source, bits [5:0] contain the offset and bits [22:16] the width. 1543 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); 1544 1545 // TODO: It might be worth using a pseudo here to avoid scc clobber and 1546 // register class constraints. 1547 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : 1548 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1549 1550 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1551 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1552 llvm_unreachable("failed to constrain BFE"); 1553 1554 MI.eraseFromParent(); 1555 return true; 1556 } 1557 1558 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( 1559 const OperandsMapper &OpdMapper) const { 1560 MachineInstr &MI = OpdMapper.getMI(); 1561 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1562 1563 // Insert basic copies. 
1564 applyDefaultMapping(OpdMapper); 1565 1566 Register Dst0 = MI.getOperand(0).getReg(); 1567 Register Dst1 = MI.getOperand(1).getReg(); 1568 Register Src0 = MI.getOperand(2).getReg(); 1569 Register Src1 = MI.getOperand(3).getReg(); 1570 Register Src2 = MI.getOperand(4).getReg(); 1571 1572 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank) 1573 return true; 1574 1575 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; 1576 LLT S1 = LLT::scalar(1); 1577 LLT S32 = LLT::scalar(32); 1578 1579 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank; 1580 bool Accumulate = true; 1581 1582 if (!DstOnValu) { 1583 if (mi_match(Src2, MRI, m_ZeroInt())) 1584 Accumulate = false; 1585 } 1586 1587 // Keep the multiplication on the SALU. 1588 MachineIRBuilder B(MI); 1589 1590 Register DstHi; 1591 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0); 1592 bool MulHiInVgpr = false; 1593 1594 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank); 1595 1596 if (Subtarget.hasSMulHi()) { 1597 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0) 1598 : B.buildSMulH(S32, Src0, Src1).getReg(0); 1599 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank); 1600 } else { 1601 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0); 1602 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0); 1603 1604 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank); 1605 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank); 1606 1607 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0) 1608 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0); 1609 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1610 1611 if (!DstOnValu) { 1612 DstHi = buildReadFirstLane(B, MRI, DstHi); 1613 } else { 1614 MulHiInVgpr = true; 1615 } 1616 } 1617 1618 // Accumulate and produce the "carry-out" bit. 1619 // 1620 // The "carry-out" is defined as bit 64 of the result when computed as a 1621 // big integer. For unsigned multiply-add, this matches the usual definition 1622 // of carry-out. 
For signed multiply-add, bit 64 is the sign bit of the 1623 // result, which is determined as: 1624 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add 1625 LLT CarryType = DstOnValu ? S1 : S32; 1626 const RegisterBank &CarryBank = 1627 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; 1628 const RegisterBank &DstBank = 1629 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; 1630 Register Carry; 1631 Register Zero; 1632 1633 if (!IsUnsigned) { 1634 Zero = B.buildConstant(S32, 0).getReg(0); 1635 MRI.setRegBank(Zero, 1636 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); 1637 1638 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero) 1639 .getReg(0); 1640 MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank 1641 : AMDGPU::SGPRRegBank); 1642 1643 if (DstOnValu && !MulHiInVgpr) { 1644 Carry = B.buildTrunc(S1, Carry).getReg(0); 1645 MRI.setRegBank(Carry, AMDGPU::VCCRegBank); 1646 } 1647 } 1648 1649 if (Accumulate) { 1650 if (DstOnValu) { 1651 DstLo = B.buildCopy(S32, DstLo).getReg(0); 1652 DstHi = B.buildCopy(S32, DstHi).getReg(0); 1653 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank); 1654 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1655 } 1656 1657 auto Unmerge = B.buildUnmerge(S32, Src2); 1658 Register Src2Lo = Unmerge.getReg(0); 1659 Register Src2Hi = Unmerge.getReg(1); 1660 MRI.setRegBank(Src2Lo, DstBank); 1661 MRI.setRegBank(Src2Hi, DstBank); 1662 1663 if (!IsUnsigned) { 1664 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero); 1665 MRI.setRegBank(Src2Sign.getReg(0), CarryBank); 1666 1667 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0); 1668 MRI.setRegBank(Carry, CarryBank); 1669 } 1670 1671 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo); 1672 DstLo = AddLo.getReg(0); 1673 Register CarryLo = AddLo.getReg(1); 1674 MRI.setRegBank(DstLo, DstBank); 1675 MRI.setRegBank(CarryLo, CarryBank); 1676 1677 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo); 1678 DstHi = 
AddHi.getReg(0); 1679 MRI.setRegBank(DstHi, DstBank); 1680 1681 Register CarryHi = AddHi.getReg(1); 1682 MRI.setRegBank(CarryHi, CarryBank); 1683 1684 if (IsUnsigned) { 1685 Carry = CarryHi; 1686 } else { 1687 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0); 1688 MRI.setRegBank(Carry, CarryBank); 1689 } 1690 } else { 1691 if (IsUnsigned) { 1692 Carry = B.buildConstant(CarryType, 0).getReg(0); 1693 MRI.setRegBank(Carry, CarryBank); 1694 } 1695 } 1696 1697 B.buildMerge(Dst0, {DstLo, DstHi}); 1698 1699 if (DstOnValu) { 1700 B.buildCopy(Dst1, Carry); 1701 } else { 1702 B.buildTrunc(Dst1, Carry); 1703 } 1704 1705 MI.eraseFromParent(); 1706 return true; 1707 } 1708 1709 // Return a suitable opcode for extending the operands of Opc when widening. 1710 static unsigned getExtendOp(unsigned Opc) { 1711 switch (Opc) { 1712 case TargetOpcode::G_ASHR: 1713 case TargetOpcode::G_SMIN: 1714 case TargetOpcode::G_SMAX: 1715 return TargetOpcode::G_SEXT; 1716 case TargetOpcode::G_LSHR: 1717 case TargetOpcode::G_UMIN: 1718 case TargetOpcode::G_UMAX: 1719 return TargetOpcode::G_ZEXT; 1720 default: 1721 return TargetOpcode::G_ANYEXT; 1722 } 1723 } 1724 1725 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1726 // any illegal vector extend or unmerge operations. 
1727 static std::pair<Register, Register> 1728 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1729 const LLT S32 = LLT::scalar(32); 1730 auto Bitcast = B.buildBitcast(S32, Src); 1731 1732 if (ExtOpcode == TargetOpcode::G_SEXT) { 1733 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1734 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1735 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1736 } 1737 1738 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1739 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1740 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1741 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1742 } 1743 1744 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1745 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1746 } 1747 1748 // For cases where only a single copy is inserted for matching register banks. 1749 // Replace the register in the instruction operand 1750 static bool substituteSimpleCopyRegs( 1751 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1752 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1753 if (!SrcReg.empty()) { 1754 assert(SrcReg.size() == 1); 1755 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1756 return true; 1757 } 1758 1759 return false; 1760 } 1761 1762 /// Handle register layout difference for f16 images for some subtargets. 
1763 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, 1764 MachineRegisterInfo &MRI, 1765 Register Reg) const { 1766 if (!Subtarget.hasUnpackedD16VMem()) 1767 return Reg; 1768 1769 const LLT S16 = LLT::scalar(16); 1770 LLT StoreVT = MRI.getType(Reg); 1771 if (!StoreVT.isVector() || StoreVT.getElementType() != S16) 1772 return Reg; 1773 1774 auto Unmerge = B.buildUnmerge(S16, Reg); 1775 1776 1777 SmallVector<Register, 4> WideRegs; 1778 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1779 WideRegs.push_back(Unmerge.getReg(I)); 1780 1781 const LLT S32 = LLT::scalar(32); 1782 int NumElts = StoreVT.getNumElements(); 1783 1784 return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0); 1785 } 1786 1787 static std::pair<Register, unsigned> 1788 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 1789 int64_t Const; 1790 if (mi_match(Reg, MRI, m_ICst(Const))) 1791 return std::make_pair(Register(), Const); 1792 1793 Register Base; 1794 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) 1795 return std::make_pair(Base, Const); 1796 1797 // TODO: Handle G_OR used for add case 1798 return std::make_pair(Reg, 0); 1799 } 1800 1801 std::pair<Register, unsigned> 1802 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, 1803 Register OrigOffset) const { 1804 const unsigned MaxImm = 4095; 1805 Register BaseReg; 1806 unsigned ImmOffset; 1807 const LLT S32 = LLT::scalar(32); 1808 1809 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), 1810 OrigOffset); 1811 1812 unsigned C1 = 0; 1813 if (ImmOffset != 0) { 1814 // If the immediate value is too big for the immoffset field, put the value 1815 // and -4096 into the immoffset field so that the value that is copied/added 1816 // for the voffset field is a multiple of 4096, and it stands more chance 1817 // of being CSEd with the copy/add for another similar load/store. 
1818 // However, do not do that rounding down to a multiple of 4096 if that is a 1819 // negative number, as it appears to be illegal to have a negative offset 1820 // in the vgpr, even if adding the immediate offset makes it positive. 1821 unsigned Overflow = ImmOffset & ~MaxImm; 1822 ImmOffset -= Overflow; 1823 if ((int32_t)Overflow < 0) { 1824 Overflow += ImmOffset; 1825 ImmOffset = 0; 1826 } 1827 1828 C1 = ImmOffset; 1829 if (Overflow != 0) { 1830 if (!BaseReg) 1831 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1832 else { 1833 auto OverflowVal = B.buildConstant(S32, Overflow); 1834 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1835 } 1836 } 1837 } 1838 1839 if (!BaseReg) 1840 BaseReg = B.buildConstant(S32, 0).getReg(0); 1841 1842 return {BaseReg, C1}; 1843 } 1844 1845 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, 1846 Register SrcReg) const { 1847 MachineRegisterInfo &MRI = *B.getMRI(); 1848 LLT SrcTy = MRI.getType(SrcReg); 1849 if (SrcTy.getSizeInBits() == 32) { 1850 // Use a v_mov_b32 here to make the exec dependency explicit. 
1851 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1852 .addDef(DstReg) 1853 .addUse(SrcReg); 1854 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) && 1855 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI); 1856 } 1857 1858 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1859 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1860 1861 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1862 .addDef(TmpReg0) 1863 .addUse(SrcReg, 0, AMDGPU::sub0); 1864 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1865 .addDef(TmpReg1) 1866 .addUse(SrcReg, 0, AMDGPU::sub1); 1867 B.buildInstr(AMDGPU::REG_SEQUENCE) 1868 .addDef(DstReg) 1869 .addUse(TmpReg0) 1870 .addImm(AMDGPU::sub0) 1871 .addUse(TmpReg1) 1872 .addImm(AMDGPU::sub1); 1873 1874 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) && 1875 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); 1876 } 1877 1878 /// Utility function for pushing dynamic vector indexes with a constant offset 1879 /// into waterfall loops. 1880 static void reinsertVectorIndexAdd(MachineIRBuilder &B, 1881 MachineInstr &IdxUseInstr, 1882 unsigned OpIdx, 1883 unsigned ConstOffset) { 1884 MachineRegisterInfo &MRI = *B.getMRI(); 1885 const LLT S32 = LLT::scalar(32); 1886 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); 1887 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); 1888 1889 auto MaterializedOffset = B.buildConstant(S32, ConstOffset); 1890 1891 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); 1892 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); 1893 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); 1894 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); 1895 } 1896 1897 /// Implement extending a 32-bit value to a 64-bit value. 
\p Lo32Reg is the 1898 /// original 32-bit source value (to be inserted in the low part of the combined 1899 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit 1900 /// value. 1901 static void extendLow32IntoHigh32(MachineIRBuilder &B, 1902 Register Hi32Reg, Register Lo32Reg, 1903 unsigned ExtOpc, 1904 const RegisterBank &RegBank, 1905 bool IsBooleanSrc = false) { 1906 if (ExtOpc == AMDGPU::G_ZEXT) { 1907 B.buildConstant(Hi32Reg, 0); 1908 } else if (ExtOpc == AMDGPU::G_SEXT) { 1909 if (IsBooleanSrc) { 1910 // If we know the original source was an s1, the high half is the same as 1911 // the low. 1912 B.buildCopy(Hi32Reg, Lo32Reg); 1913 } else { 1914 // Replicate sign bit from 32-bit extended part. 1915 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1916 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1917 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1918 } 1919 } else { 1920 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1921 B.buildUndef(Hi32Reg); 1922 } 1923 } 1924 1925 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1926 MachineInstr &MI, MachineRegisterInfo &MRI, 1927 const OperandsMapper &OpdMapper) const { 1928 1929 Register VecReg = MI.getOperand(1).getReg(); 1930 Register Idx = MI.getOperand(2).getReg(); 1931 1932 const RegisterBank &IdxBank = 1933 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1934 1935 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1936 1937 LLT VecTy = MRI.getType(VecReg); 1938 unsigned EltSize = VecTy.getScalarSizeInBits(); 1939 unsigned NumElem = VecTy.getNumElements(); 1940 1941 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1942 IsDivergentIdx, &Subtarget)) 1943 return false; 1944 1945 MachineIRBuilder B(MI); 1946 LLT S32 = LLT::scalar(32); 1947 1948 const RegisterBank &DstBank = 1949 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1950 const RegisterBank &SrcBank = 1951 
*OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1952 1953 const RegisterBank &CCBank = 1954 (DstBank == AMDGPU::SGPRRegBank && 1955 SrcBank == AMDGPU::SGPRRegBank && 1956 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1957 : AMDGPU::VCCRegBank; 1958 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1959 1960 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1961 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1962 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1963 } 1964 1965 LLT EltTy = VecTy.getScalarType(); 1966 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1967 unsigned NumLanes = DstRegs.size(); 1968 if (!NumLanes) 1969 NumLanes = 1; 1970 else 1971 EltTy = MRI.getType(DstRegs[0]); 1972 1973 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1974 SmallVector<Register, 2> Res(NumLanes); 1975 for (unsigned L = 0; L < NumLanes; ++L) 1976 Res[L] = UnmergeToEltTy.getReg(L); 1977 1978 for (unsigned I = 1; I < NumElem; ++I) { 1979 auto IC = B.buildConstant(S32, I); 1980 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1981 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1982 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1983 1984 for (unsigned L = 0; L < NumLanes; ++L) { 1985 auto S = B.buildSelect(EltTy, Cmp, 1986 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 1987 1988 for (unsigned N : { 0, 2, 3 }) 1989 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 1990 1991 Res[L] = S->getOperand(0).getReg(); 1992 } 1993 } 1994 1995 for (unsigned L = 0; L < NumLanes; ++L) { 1996 Register DstReg = (NumLanes == 1) ? 
MI.getOperand(0).getReg() : DstRegs[L]; 1997 B.buildCopy(DstReg, Res[L]); 1998 MRI.setRegBank(DstReg, DstBank); 1999 } 2000 2001 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2002 MI.eraseFromParent(); 2003 2004 return true; 2005 } 2006 2007 // Insert a cross regbank copy for a register if it already has a bank that 2008 // differs from the one we want to set. 2009 static Register constrainRegToBank(MachineRegisterInfo &MRI, 2010 MachineIRBuilder &B, Register &Reg, 2011 const RegisterBank &Bank) { 2012 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); 2013 if (CurrBank && *CurrBank != Bank) { 2014 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); 2015 MRI.setRegBank(Copy, Bank); 2016 return Copy; 2017 } 2018 2019 MRI.setRegBank(Reg, Bank); 2020 return Reg; 2021 } 2022 2023 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 2024 MachineInstr &MI, MachineRegisterInfo &MRI, 2025 const OperandsMapper &OpdMapper) const { 2026 2027 Register VecReg = MI.getOperand(1).getReg(); 2028 Register Idx = MI.getOperand(3).getReg(); 2029 2030 const RegisterBank &IdxBank = 2031 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2032 2033 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 2034 2035 LLT VecTy = MRI.getType(VecReg); 2036 unsigned EltSize = VecTy.getScalarSizeInBits(); 2037 unsigned NumElem = VecTy.getNumElements(); 2038 2039 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 2040 IsDivergentIdx, &Subtarget)) 2041 return false; 2042 2043 MachineIRBuilder B(MI); 2044 LLT S32 = LLT::scalar(32); 2045 2046 const RegisterBank &DstBank = 2047 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2048 const RegisterBank &SrcBank = 2049 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2050 const RegisterBank &InsBank = 2051 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2052 2053 const RegisterBank &CCBank = 2054 (DstBank == AMDGPU::SGPRRegBank 
&& 2055 SrcBank == AMDGPU::SGPRRegBank && 2056 InsBank == AMDGPU::SGPRRegBank && 2057 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 2058 : AMDGPU::VCCRegBank; 2059 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 2060 2061 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 2062 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 2063 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 2064 } 2065 2066 LLT EltTy = VecTy.getScalarType(); 2067 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2068 unsigned NumLanes = InsRegs.size(); 2069 if (!NumLanes) { 2070 NumLanes = 1; 2071 InsRegs.push_back(MI.getOperand(2).getReg()); 2072 } else { 2073 EltTy = MRI.getType(InsRegs[0]); 2074 } 2075 2076 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 2077 SmallVector<Register, 16> Ops(NumElem * NumLanes); 2078 2079 for (unsigned I = 0; I < NumElem; ++I) { 2080 auto IC = B.buildConstant(S32, I); 2081 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2082 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2083 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2084 2085 for (unsigned L = 0; L < NumLanes; ++L) { 2086 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank); 2087 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L); 2088 Op1 = constrainRegToBank(MRI, B, Op1, DstBank); 2089 2090 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0); 2091 MRI.setRegBank(Select, DstBank); 2092 2093 Ops[I * NumLanes + L] = Select; 2094 } 2095 } 2096 2097 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy); 2098 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2099 B.buildBuildVector(MI.getOperand(0), Ops); 2100 } else { 2101 auto Vec = B.buildBuildVector(MergeTy, Ops); 2102 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2103 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2104 } 2105 2106 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2107 MI.eraseFromParent(); 2108 2109 return 
true; 2110 } 2111 2112 void AMDGPURegisterBankInfo::applyMappingImpl( 2113 const OperandsMapper &OpdMapper) const { 2114 MachineInstr &MI = OpdMapper.getMI(); 2115 unsigned Opc = MI.getOpcode(); 2116 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2117 switch (Opc) { 2118 case AMDGPU::G_PHI: { 2119 Register DstReg = MI.getOperand(0).getReg(); 2120 LLT DstTy = MRI.getType(DstReg); 2121 if (DstTy != LLT::scalar(1)) 2122 break; 2123 2124 const LLT S32 = LLT::scalar(32); 2125 const RegisterBank *DstBank = 2126 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2127 if (DstBank == &AMDGPU::VCCRegBank) { 2128 applyDefaultMapping(OpdMapper); 2129 // The standard handling only considers the result register bank for 2130 // phis. For VCC, blindly inserting a copy when the phi is lowered will 2131 // produce an invalid copy. We can only copy with some kind of compare to 2132 // get a vector boolean result. Insert a register bank copy that will be 2133 // correctly lowered to a compare. 2134 MachineIRBuilder B(*MI.getParent()->getParent()); 2135 2136 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2137 Register SrcReg = MI.getOperand(I).getReg(); 2138 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 2139 2140 if (SrcBank != &AMDGPU::VCCRegBank) { 2141 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); 2142 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); 2143 2144 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); 2145 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); 2146 MI.getOperand(I).setReg(Copy.getReg(0)); 2147 } 2148 } 2149 2150 return; 2151 } 2152 2153 // Phi handling is strange and only considers the bank of the destination. 
2154 substituteSimpleCopyRegs(OpdMapper, 0); 2155 2156 // Promote SGPR/VGPR booleans to s32 2157 MachineFunction *MF = MI.getParent()->getParent(); 2158 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2159 MachineIRBuilder B(MI, ApplyBank); 2160 LegalizerHelper Helper(*MF, ApplyBank, B); 2161 2162 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2163 llvm_unreachable("widen scalar should have succeeded"); 2164 2165 return; 2166 } 2167 case AMDGPU::G_ICMP: 2168 case AMDGPU::G_UADDO: 2169 case AMDGPU::G_USUBO: 2170 case AMDGPU::G_UADDE: 2171 case AMDGPU::G_SADDE: 2172 case AMDGPU::G_USUBE: 2173 case AMDGPU::G_SSUBE: { 2174 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1; 2175 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 2176 2177 const RegisterBank *DstBank = 2178 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2179 if (DstBank != &AMDGPU::SGPRRegBank) 2180 break; 2181 2182 const bool HasCarryIn = MI.getNumOperands() == 5; 2183 2184 // If this is a scalar compare, promote the result to s32, as the selection 2185 // will end up using a copy to a 32-bit vreg. 2186 const LLT S32 = LLT::scalar(32); 2187 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 2188 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 2189 MI.getOperand(BoolDstOp).setReg(NewDstReg); 2190 MachineIRBuilder B(MI); 2191 2192 if (HasCarryIn) { 2193 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 2194 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 2195 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 2196 MI.getOperand(4).setReg(NewSrcReg); 2197 } 2198 2199 MachineBasicBlock *MBB = MI.getParent(); 2200 B.setInsertPt(*MBB, std::next(MI.getIterator())); 2201 2202 // If we had a constrained VCC result register, a copy was inserted to VCC 2203 // from SGPR. 
2204 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2205 if (DefRegs.empty()) 2206 DefRegs.push_back(DstReg); 2207 B.buildTrunc(DefRegs[0], NewDstReg); 2208 return; 2209 } 2210 case AMDGPU::G_SELECT: { 2211 Register DstReg = MI.getOperand(0).getReg(); 2212 LLT DstTy = MRI.getType(DstReg); 2213 2214 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2215 if (CondRegs.empty()) 2216 CondRegs.push_back(MI.getOperand(1).getReg()); 2217 else { 2218 assert(CondRegs.size() == 1); 2219 } 2220 2221 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2222 if (CondBank == &AMDGPU::SGPRRegBank) { 2223 MachineIRBuilder B(MI); 2224 const LLT S32 = LLT::scalar(32); 2225 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2226 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2227 2228 MI.getOperand(1).setReg(NewCondReg); 2229 B.buildZExt(NewCondReg, CondRegs[0]); 2230 } 2231 2232 if (DstTy.getSizeInBits() != 64) 2233 break; 2234 2235 MachineIRBuilder B(MI); 2236 LLT HalfTy = getHalfSizedType(DstTy); 2237 2238 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2239 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2240 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2241 2242 // All inputs are SGPRs, nothing special to do. 
2243 if (DefRegs.empty()) { 2244 assert(Src1Regs.empty() && Src2Regs.empty()); 2245 break; 2246 } 2247 2248 if (Src1Regs.empty()) 2249 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2250 else { 2251 setRegsToType(MRI, Src1Regs, HalfTy); 2252 } 2253 2254 if (Src2Regs.empty()) 2255 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2256 else 2257 setRegsToType(MRI, Src2Regs, HalfTy); 2258 2259 setRegsToType(MRI, DefRegs, HalfTy); 2260 2261 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2262 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2263 2264 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2265 MI.eraseFromParent(); 2266 return; 2267 } 2268 case AMDGPU::G_BRCOND: { 2269 Register CondReg = MI.getOperand(0).getReg(); 2270 // FIXME: Should use legalizer helper, but should change bool ext type. 2271 const RegisterBank *CondBank = 2272 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2273 2274 if (CondBank == &AMDGPU::SGPRRegBank) { 2275 MachineIRBuilder B(MI); 2276 const LLT S32 = LLT::scalar(32); 2277 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2278 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2279 2280 MI.getOperand(0).setReg(NewCondReg); 2281 B.buildZExt(NewCondReg, CondReg); 2282 return; 2283 } 2284 2285 break; 2286 } 2287 case AMDGPU::G_AND: 2288 case AMDGPU::G_OR: 2289 case AMDGPU::G_XOR: { 2290 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2291 // there is a VGPR input. 
2292 Register DstReg = MI.getOperand(0).getReg(); 2293 LLT DstTy = MRI.getType(DstReg); 2294 2295 if (DstTy.getSizeInBits() == 1) { 2296 const RegisterBank *DstBank = 2297 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2298 if (DstBank == &AMDGPU::VCCRegBank) 2299 break; 2300 2301 MachineFunction *MF = MI.getParent()->getParent(); 2302 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2303 MachineIRBuilder B(MI, ApplyBank); 2304 LegalizerHelper Helper(*MF, ApplyBank, B); 2305 2306 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2307 LegalizerHelper::Legalized) 2308 llvm_unreachable("widen scalar should have succeeded"); 2309 return; 2310 } 2311 2312 if (DstTy.getSizeInBits() != 64) 2313 break; 2314 2315 LLT HalfTy = getHalfSizedType(DstTy); 2316 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2317 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2318 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2319 2320 // All inputs are SGPRs, nothing special to do. 2321 if (DefRegs.empty()) { 2322 assert(Src0Regs.empty() && Src1Regs.empty()); 2323 break; 2324 } 2325 2326 assert(DefRegs.size() == 2); 2327 assert(Src0Regs.size() == Src1Regs.size() && 2328 (Src0Regs.empty() || Src0Regs.size() == 2)); 2329 2330 // Depending on where the source registers came from, the generic code may 2331 // have decided to split the inputs already or not. If not, we still need to 2332 // extract the values. 
2333 MachineIRBuilder B(MI); 2334 2335 if (Src0Regs.empty()) 2336 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2337 else 2338 setRegsToType(MRI, Src0Regs, HalfTy); 2339 2340 if (Src1Regs.empty()) 2341 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2342 else 2343 setRegsToType(MRI, Src1Regs, HalfTy); 2344 2345 setRegsToType(MRI, DefRegs, HalfTy); 2346 2347 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); 2348 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); 2349 2350 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2351 MI.eraseFromParent(); 2352 return; 2353 } 2354 case AMDGPU::G_ABS: { 2355 Register SrcReg = MI.getOperand(1).getReg(); 2356 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); 2357 2358 // There is no VALU abs instruction so we need to replace it with a sub and 2359 // max combination. 2360 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { 2361 MachineFunction *MF = MI.getParent()->getParent(); 2362 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank); 2363 MachineIRBuilder B(MI, Apply); 2364 LegalizerHelper Helper(*MF, Apply, B); 2365 2366 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) 2367 llvm_unreachable("lowerAbsToMaxNeg should have succeeded"); 2368 return; 2369 } 2370 LLVM_FALLTHROUGH; 2371 } 2372 case AMDGPU::G_ADD: 2373 case AMDGPU::G_SUB: 2374 case AMDGPU::G_MUL: 2375 case AMDGPU::G_SHL: 2376 case AMDGPU::G_LSHR: 2377 case AMDGPU::G_ASHR: 2378 case AMDGPU::G_SMIN: 2379 case AMDGPU::G_SMAX: 2380 case AMDGPU::G_UMIN: 2381 case AMDGPU::G_UMAX: { 2382 Register DstReg = MI.getOperand(0).getReg(); 2383 LLT DstTy = MRI.getType(DstReg); 2384 2385 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2386 // Packed 16-bit operations need to be scalarized and promoted. 
2387 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) 2388 break; 2389 2390 const RegisterBank *DstBank = 2391 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2392 if (DstBank == &AMDGPU::VGPRRegBank) 2393 break; 2394 2395 const LLT S32 = LLT::scalar(32); 2396 MachineBasicBlock *MBB = MI.getParent(); 2397 MachineFunction *MF = MBB->getParent(); 2398 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2399 MachineIRBuilder B(MI, ApplySALU); 2400 2401 if (DstTy.isVector()) { 2402 Register WideSrc0Lo, WideSrc0Hi; 2403 Register WideSrc1Lo, WideSrc1Hi; 2404 2405 unsigned ExtendOp = getExtendOp(MI.getOpcode()); 2406 std::tie(WideSrc0Lo, WideSrc0Hi) 2407 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); 2408 std::tie(WideSrc1Lo, WideSrc1Hi) 2409 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); 2410 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2411 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2412 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2413 MI.eraseFromParent(); 2414 } else { 2415 LegalizerHelper Helper(*MF, ApplySALU, B); 2416 2417 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2418 llvm_unreachable("widen scalar should have succeeded"); 2419 2420 // FIXME: s16 shift amounts should be legal. 
2421 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || 2422 Opc == AMDGPU::G_ASHR) { 2423 B.setInsertPt(*MBB, MI.getIterator()); 2424 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2425 llvm_unreachable("widen scalar should have succeeded"); 2426 } 2427 } 2428 2429 return; 2430 } 2431 case AMDGPU::G_SEXT_INREG: { 2432 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2433 if (SrcRegs.empty()) 2434 break; // Nothing to repair 2435 2436 const LLT S32 = LLT::scalar(32); 2437 MachineIRBuilder B(MI); 2438 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); 2439 GISelObserverWrapper Observer(&O); 2440 B.setChangeObserver(Observer); 2441 2442 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs 2443 // we would need to further expand, and doesn't let us directly set the 2444 // result registers. 2445 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2446 2447 int Amt = MI.getOperand(2).getImm(); 2448 if (Amt <= 32) { 2449 if (Amt == 32) { 2450 // The low bits are unchanged. 2451 B.buildCopy(DstRegs[0], SrcRegs[0]); 2452 } else { 2453 // Extend in the low bits and propagate the sign bit to the high half. 2454 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt); 2455 } 2456 2457 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); 2458 } else { 2459 // The low bits are unchanged, and extend in the high bits. 
2460 B.buildCopy(DstRegs[0], SrcRegs[0]); 2461 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); 2462 } 2463 2464 Register DstReg = MI.getOperand(0).getReg(); 2465 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2466 MI.eraseFromParent(); 2467 return; 2468 } 2469 case AMDGPU::G_CTPOP: 2470 case AMDGPU::G_BITREVERSE: { 2471 const RegisterBank *DstBank = 2472 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2473 if (DstBank == &AMDGPU::SGPRRegBank) 2474 break; 2475 2476 Register SrcReg = MI.getOperand(1).getReg(); 2477 const LLT S32 = LLT::scalar(32); 2478 LLT Ty = MRI.getType(SrcReg); 2479 if (Ty == S32) 2480 break; 2481 2482 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); 2483 MachineIRBuilder B(MI, ApplyVALU); 2484 2485 MachineFunction &MF = B.getMF(); 2486 LegalizerHelper Helper(MF, ApplyVALU, B); 2487 2488 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2489 llvm_unreachable("narrowScalar should have succeeded"); 2490 return; 2491 } 2492 case AMDGPU::G_AMDGPU_FFBH_U32: 2493 case AMDGPU::G_AMDGPU_FFBL_B32: 2494 case AMDGPU::G_CTLZ_ZERO_UNDEF: 2495 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 2496 const RegisterBank *DstBank = 2497 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2498 if (DstBank == &AMDGPU::SGPRRegBank) 2499 break; 2500 2501 Register SrcReg = MI.getOperand(1).getReg(); 2502 const LLT S32 = LLT::scalar(32); 2503 LLT Ty = MRI.getType(SrcReg); 2504 if (Ty == S32) 2505 break; 2506 2507 // We can narrow this more efficiently than Helper can by using ffbh/ffbl 2508 // which return -1 when the input is zero: 2509 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) 2510 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) 2511 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) 2512 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo)) 2513 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); 2514 MachineIRBuilder B(MI, 
ApplyVALU); 2515 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2516 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF 2517 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 2518 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2519 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 2520 : Opc; 2521 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; 2522 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]}); 2523 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]}); 2524 unsigned AddOpc = 2525 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2526 ? AMDGPU::G_ADD 2527 : AMDGPU::G_UADDSAT; 2528 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)}); 2529 Register DstReg = MI.getOperand(0).getReg(); 2530 B.buildUMin(DstReg, X, Y); 2531 MI.eraseFromParent(); 2532 return; 2533 } 2534 case AMDGPU::G_SEXT: 2535 case AMDGPU::G_ZEXT: 2536 case AMDGPU::G_ANYEXT: { 2537 Register SrcReg = MI.getOperand(1).getReg(); 2538 LLT SrcTy = MRI.getType(SrcReg); 2539 const bool Signed = Opc == AMDGPU::G_SEXT; 2540 2541 assert(empty(OpdMapper.getVRegs(1))); 2542 2543 MachineIRBuilder B(MI); 2544 const RegisterBank *SrcBank = 2545 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2546 2547 Register DstReg = MI.getOperand(0).getReg(); 2548 LLT DstTy = MRI.getType(DstReg); 2549 if (DstTy.isScalar() && 2550 SrcBank != &AMDGPU::SGPRRegBank && 2551 SrcBank != &AMDGPU::VCCRegBank && 2552 // FIXME: Should handle any type that round to s64 when irregular 2553 // breakdowns supported. 2554 DstTy.getSizeInBits() == 64 && 2555 SrcTy.getSizeInBits() <= 32) { 2556 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2557 2558 // Extend to 32-bit, and then extend the low half. 
2559 if (Signed) { 2560 // TODO: Should really be buildSExtOrCopy 2561 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 2562 } else if (Opc == AMDGPU::G_ZEXT) { 2563 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 2564 } else { 2565 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); 2566 } 2567 2568 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); 2569 MRI.setRegBank(DstReg, *SrcBank); 2570 MI.eraseFromParent(); 2571 return; 2572 } 2573 2574 if (SrcTy != LLT::scalar(1)) 2575 return; 2576 2577 // It is not legal to have a legalization artifact with a VCC source. Rather 2578 // than introducing a copy, insert the select we would have to select the 2579 // copy to. 2580 if (SrcBank == &AMDGPU::VCCRegBank) { 2581 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2582 2583 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; 2584 2585 unsigned DstSize = DstTy.getSizeInBits(); 2586 // 64-bit select is SGPR only 2587 const bool UseSel64 = DstSize > 32 && 2588 SrcBank->getID() == AMDGPU::SGPRRegBankID; 2589 2590 // TODO: Should s16 select be legal? 2591 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 2592 auto True = B.buildConstant(SelType, Signed ? 
-1 : 1); 2593 auto False = B.buildConstant(SelType, 0); 2594 2595 MRI.setRegBank(True.getReg(0), *DstBank); 2596 MRI.setRegBank(False.getReg(0), *DstBank); 2597 MRI.setRegBank(DstReg, *DstBank); 2598 2599 if (DstSize > 32) { 2600 B.buildSelect(DefRegs[0], SrcReg, True, False); 2601 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2602 } else if (DstSize < 32) { 2603 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2604 MRI.setRegBank(Sel.getReg(0), *DstBank); 2605 B.buildTrunc(DstReg, Sel); 2606 } else { 2607 B.buildSelect(DstReg, SrcReg, True, False); 2608 } 2609 2610 MI.eraseFromParent(); 2611 return; 2612 } 2613 2614 break; 2615 } 2616 case AMDGPU::G_BUILD_VECTOR: 2617 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 2618 Register DstReg = MI.getOperand(0).getReg(); 2619 LLT DstTy = MRI.getType(DstReg); 2620 if (DstTy != LLT::fixed_vector(2, 16)) 2621 break; 2622 2623 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); 2624 substituteSimpleCopyRegs(OpdMapper, 1); 2625 substituteSimpleCopyRegs(OpdMapper, 2); 2626 2627 const RegisterBank *DstBank = 2628 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2629 if (DstBank == &AMDGPU::SGPRRegBank) 2630 break; // Can use S_PACK_* instructions. 
2631 2632 MachineIRBuilder B(MI); 2633 2634 Register Lo = MI.getOperand(1).getReg(); 2635 Register Hi = MI.getOperand(2).getReg(); 2636 const LLT S32 = LLT::scalar(32); 2637 2638 const RegisterBank *BankLo = 2639 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2640 const RegisterBank *BankHi = 2641 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2642 2643 Register ZextLo; 2644 Register ShiftHi; 2645 2646 if (Opc == AMDGPU::G_BUILD_VECTOR) { 2647 ZextLo = B.buildZExt(S32, Lo).getReg(0); 2648 MRI.setRegBank(ZextLo, *BankLo); 2649 2650 Register ZextHi = B.buildZExt(S32, Hi).getReg(0); 2651 MRI.setRegBank(ZextHi, *BankHi); 2652 2653 auto ShiftAmt = B.buildConstant(S32, 16); 2654 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2655 2656 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); 2657 MRI.setRegBank(ShiftHi, *BankHi); 2658 } else { 2659 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); 2660 MRI.setRegBank(MaskLo, *BankLo); 2661 2662 auto ShiftAmt = B.buildConstant(S32, 16); 2663 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2664 2665 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); 2666 MRI.setRegBank(ShiftHi, *BankHi); 2667 2668 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); 2669 MRI.setRegBank(ZextLo, *BankLo); 2670 } 2671 2672 auto Or = B.buildOr(S32, ZextLo, ShiftHi); 2673 MRI.setRegBank(Or.getReg(0), *DstBank); 2674 2675 B.buildBitcast(DstReg, Or); 2676 MI.eraseFromParent(); 2677 return; 2678 } 2679 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2680 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2681 2682 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2683 2684 Register DstReg = MI.getOperand(0).getReg(); 2685 Register SrcReg = MI.getOperand(1).getReg(); 2686 2687 const LLT S32 = LLT::scalar(32); 2688 LLT DstTy = MRI.getType(DstReg); 2689 LLT SrcTy = MRI.getType(SrcReg); 2690 2691 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) 2692 return; 2693 2694 MachineIRBuilder B(MI); 
2695 2696 const ValueMapping &DstMapping 2697 = OpdMapper.getInstrMapping().getOperandMapping(0); 2698 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2699 const RegisterBank *SrcBank = 2700 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2701 const RegisterBank *IdxBank = 2702 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2703 2704 Register BaseIdxReg; 2705 unsigned ConstOffset; 2706 std::tie(BaseIdxReg, ConstOffset) = 2707 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2708 2709 // See if the index is an add of a constant which will be foldable by moving 2710 // the base register of the index later if this is going to be executed in a 2711 // waterfall loop. This is essentially to reassociate the add of a constant 2712 // with the readfirstlane. 2713 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2714 ConstOffset > 0 && 2715 ConstOffset < SrcTy.getNumElements(); 2716 2717 // Move the base register. We'll re-insert the add later. 2718 if (ShouldMoveIndexIntoLoop) 2719 MI.getOperand(2).setReg(BaseIdxReg); 2720 2721 // If this is a VGPR result only because the index was a VGPR result, the 2722 // actual indexing will be done on the SGPR source vector, which will 2723 // produce a scalar result. We need to copy to the VGPR result inside the 2724 // waterfall loop. 2725 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2726 SrcBank == &AMDGPU::SGPRRegBank; 2727 if (DstRegs.empty()) { 2728 applyDefaultMapping(OpdMapper); 2729 2730 executeInWaterfallLoop(MI, MRI, { 2 }); 2731 2732 if (NeedCopyToVGPR) { 2733 // We don't want a phi for this temporary reg. 2734 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2735 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2736 MI.getOperand(0).setReg(TmpReg); 2737 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2738 2739 // Use a v_mov_b32 here to make the exec dependency explicit. 
2740 buildVCopy(B, DstReg, TmpReg); 2741 } 2742 2743 // Re-insert the constant offset add inside the waterfall loop. 2744 if (ShouldMoveIndexIntoLoop) 2745 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2746 2747 return; 2748 } 2749 2750 assert(DstTy.getSizeInBits() == 64); 2751 2752 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32); 2753 2754 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2755 auto One = B.buildConstant(S32, 1); 2756 2757 MachineBasicBlock::iterator MII = MI.getIterator(); 2758 2759 // Split the vector index into 32-bit pieces. Prepare to move all of the 2760 // new instructions into a waterfall loop if necessary. 2761 // 2762 // Don't put the bitcast or constant in the loop. 2763 MachineInstrSpan Span(MII, &B.getMBB()); 2764 2765 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2766 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2767 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2768 2769 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2770 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2771 2772 MRI.setRegBank(DstReg, *DstBank); 2773 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2774 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2775 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2776 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2777 2778 SmallSet<Register, 4> OpsToWaterfall; 2779 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2780 MI.eraseFromParent(); 2781 return; 2782 } 2783 2784 // Remove the original instruction to avoid potentially confusing the 2785 // waterfall loop logic. 
2786 B.setInstr(*Span.begin()); 2787 MI.eraseFromParent(); 2788 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2789 OpsToWaterfall, MRI); 2790 2791 if (NeedCopyToVGPR) { 2792 MachineBasicBlock *LoopBB = Extract1->getParent(); 2793 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2794 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2795 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2796 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2797 2798 Extract0->getOperand(0).setReg(TmpReg0); 2799 Extract1->getOperand(0).setReg(TmpReg1); 2800 2801 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2802 2803 buildVCopy(B, DstRegs[0], TmpReg0); 2804 buildVCopy(B, DstRegs[1], TmpReg1); 2805 } 2806 2807 if (ShouldMoveIndexIntoLoop) 2808 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2809 2810 return; 2811 } 2812 case AMDGPU::G_INSERT_VECTOR_ELT: { 2813 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2814 2815 Register DstReg = MI.getOperand(0).getReg(); 2816 LLT VecTy = MRI.getType(DstReg); 2817 2818 assert(OpdMapper.getVRegs(0).empty()); 2819 assert(OpdMapper.getVRegs(3).empty()); 2820 2821 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2822 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2823 2824 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) 2825 return; 2826 2827 const RegisterBank *IdxBank = 2828 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2829 2830 Register SrcReg = MI.getOperand(1).getReg(); 2831 Register InsReg = MI.getOperand(2).getReg(); 2832 LLT InsTy = MRI.getType(InsReg); 2833 (void)InsTy; 2834 2835 Register BaseIdxReg; 2836 unsigned ConstOffset; 2837 std::tie(BaseIdxReg, ConstOffset) = 2838 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2839 2840 // See if the index is an add of a constant which will be foldable by moving 2841 // the base register of the index later if this is going to be executed in a 2842 // waterfall loop. 
This is essentially to reassociate the add of a constant 2843 // with the readfirstlane. 2844 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2845 ConstOffset > 0 && 2846 ConstOffset < VecTy.getNumElements(); 2847 2848 // Move the base register. We'll re-insert the add later. 2849 if (ShouldMoveIndexIntoLoop) 2850 MI.getOperand(3).setReg(BaseIdxReg); 2851 2852 2853 if (InsRegs.empty()) { 2854 executeInWaterfallLoop(MI, MRI, { 3 }); 2855 2856 // Re-insert the constant offset add inside the waterfall loop. 2857 if (ShouldMoveIndexIntoLoop) { 2858 MachineIRBuilder B(MI); 2859 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2860 } 2861 2862 return; 2863 } 2864 2865 2866 assert(InsTy.getSizeInBits() == 64); 2867 2868 const LLT S32 = LLT::scalar(32); 2869 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); 2870 2871 MachineIRBuilder B(MI); 2872 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2873 auto One = B.buildConstant(S32, 1); 2874 2875 // Split the vector index into 32-bit pieces. Prepare to move all of the 2876 // new instructions into a waterfall loop if necessary. 2877 // 2878 // Don't put the bitcast or constant in the loop. 2879 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2880 2881 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
2882 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2883 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2884 2885 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); 2886 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); 2887 2888 const RegisterBank *DstBank = 2889 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2890 const RegisterBank *SrcBank = 2891 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2892 const RegisterBank *InsSrcBank = 2893 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2894 2895 MRI.setRegBank(InsReg, *InsSrcBank); 2896 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2897 MRI.setRegBank(InsLo.getReg(0), *DstBank); 2898 MRI.setRegBank(InsHi.getReg(0), *DstBank); 2899 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2900 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2901 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2902 2903 2904 SmallSet<Register, 4> OpsToWaterfall; 2905 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { 2906 B.setInsertPt(B.getMBB(), MI); 2907 B.buildBitcast(DstReg, InsHi); 2908 MI.eraseFromParent(); 2909 return; 2910 } 2911 2912 B.setInstr(*Span.begin()); 2913 MI.eraseFromParent(); 2914 2915 // Figure out the point after the waterfall loop before mangling the control 2916 // flow. 2917 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2918 OpsToWaterfall, MRI); 2919 2920 // The insertion point is now right after the original instruction. 2921 // 2922 // Keep the bitcast to the original vector type out of the loop. Doing this 2923 // saved an extra phi we don't need inside the loop. 2924 B.buildBitcast(DstReg, InsHi); 2925 2926 // Re-insert the constant offset add inside the waterfall loop. 
2927 if (ShouldMoveIndexIntoLoop) 2928 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2929 2930 return; 2931 } 2932 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 2933 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 2934 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 2935 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 2936 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 2937 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 2938 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 2939 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 2940 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 2941 case AMDGPU::G_AMDGPU_BUFFER_STORE: 2942 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 2943 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 2944 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 2945 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 2946 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 2947 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 2948 applyDefaultMapping(OpdMapper); 2949 executeInWaterfallLoop(MI, MRI, {1, 4}); 2950 return; 2951 } 2952 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 2953 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 2954 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 2955 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 2956 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 2957 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 2958 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 2959 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 2960 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 2961 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 2962 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 2963 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 2964 applyDefaultMapping(OpdMapper); 2965 executeInWaterfallLoop(MI, MRI, {2, 5}); 2966 return; 2967 } 2968 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 2969 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 2970 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 2971 applyDefaultMapping(OpdMapper); 2972 executeInWaterfallLoop(MI, MRI, {2, 5}); 2973 return; 2974 } 2975 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 2976 applyDefaultMapping(OpdMapper); 2977 
executeInWaterfallLoop(MI, MRI, {3, 6}); 2978 return; 2979 } 2980 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 2981 applyMappingSBufferLoad(OpdMapper); 2982 return; 2983 } 2984 case AMDGPU::G_INTRINSIC: { 2985 switch (MI.getIntrinsicID()) { 2986 case Intrinsic::amdgcn_readlane: { 2987 substituteSimpleCopyRegs(OpdMapper, 2); 2988 2989 assert(OpdMapper.getVRegs(0).empty()); 2990 assert(OpdMapper.getVRegs(3).empty()); 2991 2992 // Make sure the index is an SGPR. It doesn't make sense to run this in a 2993 // waterfall loop, so assume it's a uniform value. 2994 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2995 return; 2996 } 2997 case Intrinsic::amdgcn_writelane: { 2998 assert(OpdMapper.getVRegs(0).empty()); 2999 assert(OpdMapper.getVRegs(2).empty()); 3000 assert(OpdMapper.getVRegs(3).empty()); 3001 3002 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 3003 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 3004 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 3005 return; 3006 } 3007 case Intrinsic::amdgcn_interp_p1: 3008 case Intrinsic::amdgcn_interp_p2: 3009 case Intrinsic::amdgcn_interp_mov: 3010 case Intrinsic::amdgcn_interp_p1_f16: 3011 case Intrinsic::amdgcn_interp_p2_f16: { 3012 applyDefaultMapping(OpdMapper); 3013 3014 // Readlane for m0 value, which is always the last operand. 3015 // FIXME: Should this be a waterfall loop instead? 3016 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index 3017 return; 3018 } 3019 case Intrinsic::amdgcn_permlane16: 3020 case Intrinsic::amdgcn_permlanex16: { 3021 // Doing a waterfall loop over these wouldn't make any sense. 
3022 substituteSimpleCopyRegs(OpdMapper, 2); 3023 substituteSimpleCopyRegs(OpdMapper, 3); 3024 constrainOpWithReadfirstlane(MI, MRI, 4); 3025 constrainOpWithReadfirstlane(MI, MRI, 5); 3026 return; 3027 } 3028 case Intrinsic::amdgcn_sbfe: 3029 applyMappingBFE(OpdMapper, true); 3030 return; 3031 case Intrinsic::amdgcn_ubfe: 3032 applyMappingBFE(OpdMapper, false); 3033 return; 3034 case Intrinsic::amdgcn_ballot: 3035 // Use default handling and insert copy to vcc source. 3036 break; 3037 } 3038 break; 3039 } 3040 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 3041 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 3042 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 3043 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 3044 const AMDGPU::RsrcIntrinsic *RSrcIntrin 3045 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID()); 3046 assert(RSrcIntrin && RSrcIntrin->IsImage); 3047 // Non-images can have complications from operands that allow both SGPR 3048 // and VGPR. For now it's too complicated to figure out the final opcode 3049 // to derive the register bank from the MCInstrDesc. 3050 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); 3051 return; 3052 } 3053 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { 3054 unsigned N = MI.getNumExplicitOperands() - 2; 3055 applyDefaultMapping(OpdMapper); 3056 executeInWaterfallLoop(MI, MRI, { N }); 3057 return; 3058 } 3059 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 3060 auto IntrID = MI.getIntrinsicID(); 3061 switch (IntrID) { 3062 case Intrinsic::amdgcn_ds_ordered_add: 3063 case Intrinsic::amdgcn_ds_ordered_swap: { 3064 // This is only allowed to execute with 1 lane, so readfirstlane is safe. 
3065 assert(OpdMapper.getVRegs(0).empty()); 3066 substituteSimpleCopyRegs(OpdMapper, 3); 3067 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 3068 return; 3069 } 3070 case Intrinsic::amdgcn_ds_gws_init: 3071 case Intrinsic::amdgcn_ds_gws_barrier: 3072 case Intrinsic::amdgcn_ds_gws_sema_br: { 3073 // Only the first lane is executes, so readfirstlane is safe. 3074 substituteSimpleCopyRegs(OpdMapper, 1); 3075 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 3076 return; 3077 } 3078 case Intrinsic::amdgcn_ds_gws_sema_v: 3079 case Intrinsic::amdgcn_ds_gws_sema_p: 3080 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 3081 // Only the first lane is executes, so readfirstlane is safe. 3082 constrainOpWithReadfirstlane(MI, MRI, 1); // M0 3083 return; 3084 } 3085 case Intrinsic::amdgcn_ds_append: 3086 case Intrinsic::amdgcn_ds_consume: { 3087 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 3088 return; 3089 } 3090 case Intrinsic::amdgcn_s_sendmsg: 3091 case Intrinsic::amdgcn_s_sendmsghalt: { 3092 // FIXME: Should this use a waterfall loop? 
3093 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 3094 return; 3095 } 3096 case Intrinsic::amdgcn_s_setreg: { 3097 constrainOpWithReadfirstlane(MI, MRI, 2); 3098 return; 3099 } 3100 case Intrinsic::amdgcn_raw_buffer_load_lds: { 3101 applyDefaultMapping(OpdMapper); 3102 constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc 3103 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 3104 constrainOpWithReadfirstlane(MI, MRI, 5); // soffset 3105 return; 3106 } 3107 case Intrinsic::amdgcn_struct_buffer_load_lds: { 3108 applyDefaultMapping(OpdMapper); 3109 constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc 3110 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 3111 constrainOpWithReadfirstlane(MI, MRI, 6); // soffset 3112 return; 3113 } 3114 case Intrinsic::amdgcn_global_load_lds: { 3115 applyDefaultMapping(OpdMapper); 3116 constrainOpWithReadfirstlane(MI, MRI, 2); 3117 return; 3118 } 3119 default: { 3120 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = 3121 AMDGPU::lookupRsrcIntrinsic(IntrID)) { 3122 // Non-images can have complications from operands that allow both SGPR 3123 // and VGPR. For now it's too complicated to figure out the final opcode 3124 // to derive the register bank from the MCInstrDesc. 3125 if (RSrcIntrin->IsImage) { 3126 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); 3127 return; 3128 } 3129 } 3130 3131 break; 3132 } 3133 } 3134 break; 3135 } 3136 case AMDGPU::G_SI_CALL: { 3137 // Use a set to avoid extra readfirstlanes in the case where multiple 3138 // operands are the same register. 3139 SmallSet<Register, 4> SGPROperandRegs; 3140 3141 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1})) 3142 break; 3143 3144 // Move all copies to physical SGPRs that are used by the call instruction 3145 // into the loop block. Start searching for these copies until the 3146 // ADJCALLSTACKUP. 
3147 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP; 3148 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN; 3149 3150 // Move all non-copies before the copies, so that a complete range can be 3151 // moved into the waterfall loop. 3152 SmallVector<MachineInstr *, 4> NonCopyInstrs; 3153 // Count of NonCopyInstrs found until the current LastCopy. 3154 unsigned NonCopyInstrsLen = 0; 3155 MachineBasicBlock::iterator Start(&MI); 3156 MachineBasicBlock::iterator LastCopy = Start; 3157 MachineBasicBlock *MBB = MI.getParent(); 3158 const SIMachineFunctionInfo *Info = 3159 MBB->getParent()->getInfo<SIMachineFunctionInfo>(); 3160 while (Start->getOpcode() != FrameSetupOpcode) { 3161 --Start; 3162 bool IsCopy = false; 3163 if (Start->getOpcode() == AMDGPU::COPY) { 3164 auto &Dst = Start->getOperand(0); 3165 if (Dst.isReg()) { 3166 Register Reg = Dst.getReg(); 3167 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) { 3168 IsCopy = true; 3169 } else { 3170 // Also move the copy from the scratch rsrc descriptor into the loop 3171 // to allow it to be optimized away. 
3172 auto &Src = Start->getOperand(1); 3173 if (Src.isReg()) { 3174 Reg = Src.getReg(); 3175 IsCopy = Info->getScratchRSrcReg() == Reg; 3176 } 3177 } 3178 } 3179 } 3180 3181 if (IsCopy) { 3182 LastCopy = Start; 3183 NonCopyInstrsLen = NonCopyInstrs.size(); 3184 } else { 3185 NonCopyInstrs.push_back(&*Start); 3186 } 3187 } 3188 NonCopyInstrs.resize(NonCopyInstrsLen); 3189 3190 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3191 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3192 } 3193 Start = LastCopy; 3194 3195 // Do the same for copies after the loop 3196 NonCopyInstrs.clear(); 3197 NonCopyInstrsLen = 0; 3198 MachineBasicBlock::iterator End(&MI); 3199 LastCopy = End; 3200 while (End->getOpcode() != FrameDestroyOpcode) { 3201 ++End; 3202 bool IsCopy = false; 3203 if (End->getOpcode() == AMDGPU::COPY) { 3204 auto &Src = End->getOperand(1); 3205 if (Src.isReg()) { 3206 Register Reg = Src.getReg(); 3207 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); 3208 } 3209 } 3210 3211 if (IsCopy) { 3212 LastCopy = End; 3213 NonCopyInstrsLen = NonCopyInstrs.size(); 3214 } else { 3215 NonCopyInstrs.push_back(&*End); 3216 } 3217 } 3218 NonCopyInstrs.resize(NonCopyInstrsLen); 3219 3220 End = LastCopy; 3221 ++LastCopy; 3222 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3223 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3224 } 3225 3226 ++End; 3227 MachineIRBuilder B(*Start); 3228 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI); 3229 break; 3230 } 3231 case AMDGPU::G_LOAD: 3232 case AMDGPU::G_ZEXTLOAD: 3233 case AMDGPU::G_SEXTLOAD: { 3234 if (applyMappingLoad(MI, OpdMapper, MRI)) 3235 return; 3236 break; 3237 } 3238 case AMDGPU::G_DYN_STACKALLOC: 3239 applyMappingDynStackAlloc(MI, OpdMapper, MRI); 3240 return; 3241 case AMDGPU::G_SBFX: 3242 applyMappingBFE(OpdMapper, /*Signed*/ true); 3243 return; 3244 case AMDGPU::G_UBFX: 3245 applyMappingBFE(OpdMapper, /*Signed*/ false); 3246 return; 3247 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3248 case 
AMDGPU::G_AMDGPU_MAD_I64_I32: 3249 applyMappingMAD_64_32(OpdMapper); 3250 return; 3251 default: 3252 break; 3253 } 3254 3255 return applyDefaultMapping(OpdMapper); 3256 } 3257 3258 // vgpr, sgpr -> vgpr 3259 // vgpr, agpr -> vgpr 3260 // agpr, agpr -> agpr 3261 // agpr, sgpr -> vgpr 3262 static unsigned regBankUnion(unsigned RB0, unsigned RB1) { 3263 if (RB0 == AMDGPU::InvalidRegBankID) 3264 return RB1; 3265 if (RB1 == AMDGPU::InvalidRegBankID) 3266 return RB0; 3267 3268 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) 3269 return AMDGPU::SGPRRegBankID; 3270 3271 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) 3272 return AMDGPU::AGPRRegBankID; 3273 3274 return AMDGPU::VGPRRegBankID; 3275 } 3276 3277 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { 3278 if (RB0 == AMDGPU::InvalidRegBankID) 3279 return RB1; 3280 if (RB1 == AMDGPU::InvalidRegBankID) 3281 return RB0; 3282 3283 // vcc, vcc -> vcc 3284 // vcc, sgpr -> vcc 3285 // vcc, vgpr -> vcc 3286 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) 3287 return AMDGPU::VCCRegBankID; 3288 3289 // vcc, vgpr -> vgpr 3290 return regBankUnion(RB0, RB1); 3291 } 3292 3293 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, 3294 const MachineInstr &MI) const { 3295 unsigned RegBank = AMDGPU::InvalidRegBankID; 3296 3297 for (const MachineOperand &MO : MI.operands()) { 3298 if (!MO.isReg()) 3299 continue; 3300 Register Reg = MO.getReg(); 3301 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3302 RegBank = regBankUnion(RegBank, Bank->getID()); 3303 if (RegBank == AMDGPU::VGPRRegBankID) 3304 break; 3305 } 3306 } 3307 3308 return RegBank; 3309 } 3310 3311 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 3312 const MachineFunction &MF = *MI.getParent()->getParent(); 3313 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3314 for (const MachineOperand &MO : MI.operands()) { 3315 if (!MO.isReg()) 3316 
continue; 3317 Register Reg = MO.getReg(); 3318 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3319 if (Bank->getID() != AMDGPU::SGPRRegBankID) 3320 return false; 3321 } 3322 } 3323 return true; 3324 } 3325 3326 const RegisterBankInfo::InstructionMapping & 3327 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 3328 const MachineFunction &MF = *MI.getParent()->getParent(); 3329 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3330 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3331 3332 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3333 const MachineOperand &SrcOp = MI.getOperand(i); 3334 if (!SrcOp.isReg()) 3335 continue; 3336 3337 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); 3338 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3339 } 3340 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3341 MI.getNumOperands()); 3342 } 3343 3344 const RegisterBankInfo::InstructionMapping & 3345 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { 3346 const MachineFunction &MF = *MI.getParent()->getParent(); 3347 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3348 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3349 3350 // Even though we technically could use SGPRs, this would require knowledge of 3351 // the constant bus restriction. Force all sources to VGPR (except for VCC). 3352 // 3353 // TODO: Unary ops are trivially OK, so accept SGPRs? 3354 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3355 const MachineOperand &Src = MI.getOperand(i); 3356 if (!Src.isReg()) 3357 continue; 3358 3359 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); 3360 unsigned BankID = Size == 1 ? 
AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 3361 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); 3362 } 3363 3364 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3365 MI.getNumOperands()); 3366 } 3367 3368 const RegisterBankInfo::InstructionMapping & 3369 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { 3370 const MachineFunction &MF = *MI.getParent()->getParent(); 3371 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3372 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3373 3374 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 3375 const MachineOperand &Op = MI.getOperand(I); 3376 if (!Op.isReg()) 3377 continue; 3378 3379 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); 3380 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3381 } 3382 3383 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3384 MI.getNumOperands()); 3385 } 3386 3387 const RegisterBankInfo::InstructionMapping & 3388 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, 3389 const MachineInstr &MI, 3390 int RsrcIdx) const { 3391 // The reported argument index is relative to the IR intrinsic call arguments, 3392 // so we need to shift by the number of defs and the intrinsic ID. 3393 RsrcIdx += MI.getNumExplicitDefs() + 1; 3394 3395 const int NumOps = MI.getNumOperands(); 3396 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); 3397 3398 // TODO: Should packed/unpacked D16 difference be reported here as part of 3399 // the value mapping? 
3400 for (int I = 0; I != NumOps; ++I) { 3401 if (!MI.getOperand(I).isReg()) 3402 continue; 3403 3404 Register OpReg = MI.getOperand(I).getReg(); 3405 // We replace some dead address operands with $noreg 3406 if (!OpReg) 3407 continue; 3408 3409 unsigned Size = getSizeInBits(OpReg, MRI, *TRI); 3410 3411 // FIXME: Probably need a new intrinsic register bank searchable table to 3412 // handle arbitrary intrinsics easily. 3413 // 3414 // If this has a sampler, it immediately follows rsrc. 3415 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; 3416 3417 if (MustBeSGPR) { 3418 // If this must be an SGPR, so we must report whatever it is as legal. 3419 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID); 3420 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); 3421 } else { 3422 // Some operands must be VGPR, and these are easy to copy to. 3423 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3424 } 3425 } 3426 3427 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); 3428 } 3429 3430 /// Return the mapping for a pointer argument. 3431 const RegisterBankInfo::ValueMapping * 3432 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, 3433 Register PtrReg) const { 3434 LLT PtrTy = MRI.getType(PtrReg); 3435 unsigned Size = PtrTy.getSizeInBits(); 3436 if (Subtarget.useFlatForGlobal() || 3437 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) 3438 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3439 3440 // If we're using MUBUF instructions for global memory, an SGPR base register 3441 // is possible. Otherwise this needs to be a VGPR. 
3442 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3443 return AMDGPU::getValueMapping(PtrBank->getID(), Size); 3444 } 3445 3446 const RegisterBankInfo::InstructionMapping & 3447 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { 3448 3449 const MachineFunction &MF = *MI.getParent()->getParent(); 3450 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3451 SmallVector<const ValueMapping*, 2> OpdsMapping(2); 3452 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3453 Register PtrReg = MI.getOperand(1).getReg(); 3454 LLT PtrTy = MRI.getType(PtrReg); 3455 unsigned AS = PtrTy.getAddressSpace(); 3456 unsigned PtrSize = PtrTy.getSizeInBits(); 3457 3458 const ValueMapping *ValMapping; 3459 const ValueMapping *PtrMapping; 3460 3461 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3462 3463 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { 3464 if (isScalarLoadLegal(MI)) { 3465 // We have a uniform instruction so we want to use an SMRD load 3466 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3467 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); 3468 } else { 3469 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3470 3471 // If we're using MUBUF instructions for global memory, an SGPR base 3472 // register is possible. Otherwise this needs to be a VGPR. 3473 unsigned PtrBankID = Subtarget.useFlatForGlobal() ? 
3474 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; 3475 3476 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize); 3477 } 3478 } else { 3479 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3480 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); 3481 } 3482 3483 OpdsMapping[0] = ValMapping; 3484 OpdsMapping[1] = PtrMapping; 3485 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( 3486 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); 3487 return Mapping; 3488 3489 // FIXME: Do we want to add a mapping for FLAT load, or should we just 3490 // handle that during instruction selection? 3491 } 3492 3493 unsigned 3494 AMDGPURegisterBankInfo::getRegBankID(Register Reg, 3495 const MachineRegisterInfo &MRI, 3496 unsigned Default) const { 3497 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3498 return Bank ? Bank->getID() : Default; 3499 } 3500 3501 const RegisterBankInfo::ValueMapping * 3502 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, 3503 const MachineRegisterInfo &MRI, 3504 const TargetRegisterInfo &TRI) const { 3505 // Lie and claim anything is legal, even though this needs to be an SGPR 3506 // applyMapping will have to deal with it as a waterfall loop. 
3507 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID); 3508 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3509 return AMDGPU::getValueMapping(Bank, Size); 3510 } 3511 3512 const RegisterBankInfo::ValueMapping * 3513 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, 3514 const MachineRegisterInfo &MRI, 3515 const TargetRegisterInfo &TRI) const { 3516 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3517 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3518 } 3519 3520 const RegisterBankInfo::ValueMapping * 3521 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, 3522 const MachineRegisterInfo &MRI, 3523 const TargetRegisterInfo &TRI) const { 3524 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3525 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size); 3526 } 3527 3528 /// 3529 /// This function must return a legal mapping, because 3530 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called 3531 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a 3532 /// VGPR to SGPR generated is illegal. 3533 /// 3534 // Operands that must be SGPRs must accept potentially divergent VGPRs as 3535 // legal. These will be dealt with in applyMappingImpl. 3536 // 3537 const RegisterBankInfo::InstructionMapping & 3538 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { 3539 const MachineFunction &MF = *MI.getParent()->getParent(); 3540 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3541 3542 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { 3543 // The default logic bothers to analyze impossible alternative mappings. We 3544 // want the most straightforward mapping, so just directly handle this. 
3545 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI, 3546 *TRI); 3547 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI, 3548 *TRI); 3549 assert(SrcBank && "src bank should have been assigned already"); 3550 if (!DstBank) 3551 DstBank = SrcBank; 3552 3553 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3554 if (cannotCopy(*DstBank, *SrcBank, Size)) 3555 return getInvalidInstructionMapping(); 3556 3557 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank); 3558 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2; 3559 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); 3560 OpdsMapping[0] = &ValMap; 3561 if (MI.getOpcode() == AMDGPU::G_FREEZE) 3562 OpdsMapping[1] = &ValMap; 3563 3564 return getInstructionMapping( 3565 1, /*Cost*/ 1, 3566 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize); 3567 } 3568 3569 if (MI.isRegSequence()) { 3570 // If any input is a VGPR, the result must be a VGPR. The default handling 3571 // assumes any copy between banks is legal. 3572 unsigned BankID = AMDGPU::SGPRRegBankID; 3573 3574 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3575 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI); 3576 // It doesn't make sense to use vcc or scc banks here, so just ignore 3577 // them. 3578 if (OpBank != AMDGPU::SGPRRegBankID) { 3579 BankID = AMDGPU::VGPRRegBankID; 3580 break; 3581 } 3582 } 3583 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3584 3585 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); 3586 return getInstructionMapping( 3587 1, /*Cost*/ 1, 3588 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3589 } 3590 3591 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies 3592 // properly. 3593 // 3594 // TODO: There are additional exec masking dependencies to analyze. 
3595 if (MI.getOpcode() == TargetOpcode::G_PHI) { 3596 unsigned ResultBank = AMDGPU::InvalidRegBankID; 3597 Register DstReg = MI.getOperand(0).getReg(); 3598 3599 // Sometimes the result may have already been assigned a bank. 3600 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) 3601 ResultBank = DstBank->getID(); 3602 3603 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3604 Register Reg = MI.getOperand(I).getReg(); 3605 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3606 3607 // FIXME: Assuming VGPR for any undetermined inputs. 3608 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 3609 ResultBank = AMDGPU::VGPRRegBankID; 3610 break; 3611 } 3612 3613 // FIXME: Need to promote SGPR case to s32 3614 unsigned OpBank = Bank->getID(); 3615 ResultBank = regBankBoolUnion(ResultBank, OpBank); 3616 } 3617 3618 assert(ResultBank != AMDGPU::InvalidRegBankID); 3619 3620 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 3621 3622 const ValueMapping &ValMap = 3623 getValueMapping(0, Size, getRegBank(ResultBank)); 3624 return getInstructionMapping( 3625 1, /*Cost*/ 1, 3626 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3627 } 3628 3629 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 3630 if (Mapping.isValid()) 3631 return Mapping; 3632 3633 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3634 3635 switch (MI.getOpcode()) { 3636 default: 3637 return getInvalidInstructionMapping(); 3638 3639 case AMDGPU::G_AND: 3640 case AMDGPU::G_OR: 3641 case AMDGPU::G_XOR: { 3642 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3643 if (Size == 1) { 3644 const RegisterBank *DstBank 3645 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 3646 3647 unsigned TargetBankID = AMDGPU::InvalidRegBankID; 3648 unsigned BankLHS = AMDGPU::InvalidRegBankID; 3649 unsigned BankRHS = AMDGPU::InvalidRegBankID; 3650 if (DstBank) { 3651 TargetBankID = DstBank->getID(); 3652 if 
(DstBank == &AMDGPU::VCCRegBank) { 3653 TargetBankID = AMDGPU::VCCRegBankID; 3654 BankLHS = AMDGPU::VCCRegBankID; 3655 BankRHS = AMDGPU::VCCRegBankID; 3656 } else { 3657 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3658 AMDGPU::SGPRRegBankID); 3659 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3660 AMDGPU::SGPRRegBankID); 3661 } 3662 } else { 3663 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3664 AMDGPU::VCCRegBankID); 3665 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3666 AMDGPU::VCCRegBankID); 3667 3668 // Both inputs should be true booleans to produce a boolean result. 3669 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 3670 TargetBankID = AMDGPU::VGPRRegBankID; 3671 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 3672 TargetBankID = AMDGPU::VCCRegBankID; 3673 BankLHS = AMDGPU::VCCRegBankID; 3674 BankRHS = AMDGPU::VCCRegBankID; 3675 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 3676 TargetBankID = AMDGPU::SGPRRegBankID; 3677 } 3678 } 3679 3680 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 3681 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 3682 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 3683 break; 3684 } 3685 3686 if (Size == 64) { 3687 3688 if (isSALUMapping(MI)) { 3689 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 3690 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 3691 } else { 3692 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 3693 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); 3694 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 3695 3696 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/); 3697 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 3698 } 3699 3700 break; 3701 } 3702 3703 LLVM_FALLTHROUGH; 3704 } 3705 case AMDGPU::G_PTR_ADD: 3706 case 
AMDGPU::G_PTRMASK: 3707 case AMDGPU::G_ADD: 3708 case AMDGPU::G_SUB: 3709 case AMDGPU::G_MUL: 3710 case AMDGPU::G_SHL: 3711 case AMDGPU::G_LSHR: 3712 case AMDGPU::G_ASHR: 3713 case AMDGPU::G_UADDO: 3714 case AMDGPU::G_USUBO: 3715 case AMDGPU::G_UADDE: 3716 case AMDGPU::G_SADDE: 3717 case AMDGPU::G_USUBE: 3718 case AMDGPU::G_SSUBE: 3719 case AMDGPU::G_SMIN: 3720 case AMDGPU::G_SMAX: 3721 case AMDGPU::G_UMIN: 3722 case AMDGPU::G_UMAX: 3723 case AMDGPU::G_ABS: 3724 case AMDGPU::G_SHUFFLE_VECTOR: 3725 case AMDGPU::G_SBFX: 3726 case AMDGPU::G_UBFX: 3727 if (isSALUMapping(MI)) 3728 return getDefaultMappingSOP(MI); 3729 LLVM_FALLTHROUGH; 3730 3731 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU 3732 case AMDGPU::G_SSUBSAT: 3733 case AMDGPU::G_UADDSAT: 3734 case AMDGPU::G_USUBSAT: 3735 case AMDGPU::G_FADD: 3736 case AMDGPU::G_FSUB: 3737 case AMDGPU::G_FPTOSI: 3738 case AMDGPU::G_FPTOUI: 3739 case AMDGPU::G_FMUL: 3740 case AMDGPU::G_FMA: 3741 case AMDGPU::G_FMAD: 3742 case AMDGPU::G_FSQRT: 3743 case AMDGPU::G_FFLOOR: 3744 case AMDGPU::G_FCEIL: 3745 case AMDGPU::G_FRINT: 3746 case AMDGPU::G_SITOFP: 3747 case AMDGPU::G_UITOFP: 3748 case AMDGPU::G_FPTRUNC: 3749 case AMDGPU::G_FPEXT: 3750 case AMDGPU::G_FEXP2: 3751 case AMDGPU::G_FLOG2: 3752 case AMDGPU::G_FMINNUM: 3753 case AMDGPU::G_FMAXNUM: 3754 case AMDGPU::G_FMINNUM_IEEE: 3755 case AMDGPU::G_FMAXNUM_IEEE: 3756 case AMDGPU::G_FCANONICALIZE: 3757 case AMDGPU::G_INTRINSIC_TRUNC: 3758 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
3759 case AMDGPU::G_FSHR: // TODO: Expand for scalar 3760 case AMDGPU::G_AMDGPU_FMIN_LEGACY: 3761 case AMDGPU::G_AMDGPU_FMAX_LEGACY: 3762 case AMDGPU::G_AMDGPU_RCP_IFLAG: 3763 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 3764 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 3765 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 3766 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 3767 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: 3768 case AMDGPU::G_AMDGPU_SMED3: 3769 return getDefaultMappingVOP(MI); 3770 case AMDGPU::G_UMULH: 3771 case AMDGPU::G_SMULH: { 3772 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 3773 return getDefaultMappingSOP(MI); 3774 return getDefaultMappingVOP(MI); 3775 } 3776 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3777 case AMDGPU::G_AMDGPU_MAD_I64_I32: { 3778 // Three possible mappings: 3779 // 3780 // - Default SOP 3781 // - Default VOP 3782 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. 3783 // 3784 // This allows instruction selection to keep the multiplication part of the 3785 // instruction on the SALU. 3786 bool AllSalu = true; 3787 bool MulSalu = true; 3788 for (unsigned i = 0; i < 5; ++i) { 3789 Register Reg = MI.getOperand(i).getReg(); 3790 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3791 if (Bank->getID() != AMDGPU::SGPRRegBankID) { 3792 AllSalu = false; 3793 if (i == 2 || i == 3) { 3794 MulSalu = false; 3795 break; 3796 } 3797 } 3798 } 3799 } 3800 3801 if (AllSalu) 3802 return getDefaultMappingSOP(MI); 3803 3804 // If the multiply-add is full-rate in VALU, use that even if the 3805 // multiplication part is scalar. Accumulating separately on the VALU would 3806 // take two instructions. 3807 if (!MulSalu || Subtarget.hasFullRate64Ops()) 3808 return getDefaultMappingVOP(MI); 3809 3810 // Keep the multiplication on the SALU, then accumulate on the VALU. 
3811 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 3812 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3813 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3814 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3815 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 3816 break; 3817 } 3818 case AMDGPU::G_IMPLICIT_DEF: { 3819 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3820 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3821 break; 3822 } 3823 case AMDGPU::G_FCONSTANT: 3824 case AMDGPU::G_CONSTANT: 3825 case AMDGPU::G_GLOBAL_VALUE: 3826 case AMDGPU::G_BLOCK_ADDR: 3827 case AMDGPU::G_READCYCLECOUNTER: { 3828 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3829 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3830 break; 3831 } 3832 case AMDGPU::G_FRAME_INDEX: { 3833 // TODO: This should be the same as other constants, but eliminateFrameIndex 3834 // currently assumes VALU uses. 3835 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3836 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3837 break; 3838 } 3839 case AMDGPU::G_DYN_STACKALLOC: { 3840 // Result is always uniform, and a wave reduction is needed for the source. 3841 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3842 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3843 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); 3844 break; 3845 } 3846 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { 3847 // This case is weird because we expect a physical register in the source, 3848 // but need to set a bank anyway. 3849 // 3850 // We could select the result to SGPR or VGPR, but for the one current use 3851 // it's more practical to always use VGPR. 
3852 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3853 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3854 break; 3855 } 3856 case AMDGPU::G_INSERT: { 3857 unsigned BankID = getMappingType(MRI, MI); 3858 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3859 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3860 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 3861 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3862 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3863 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 3864 OpdsMapping[3] = nullptr; 3865 break; 3866 } 3867 case AMDGPU::G_EXTRACT: { 3868 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3869 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3870 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3871 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3872 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3873 OpdsMapping[2] = nullptr; 3874 break; 3875 } 3876 case AMDGPU::G_BUILD_VECTOR: 3877 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 3878 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 3879 if (DstTy == LLT::fixed_vector(2, 16)) { 3880 unsigned DstSize = DstTy.getSizeInBits(); 3881 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3882 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3883 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 3884 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 3885 3886 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); 3887 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 3888 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 3889 break; 3890 } 3891 3892 LLVM_FALLTHROUGH; 3893 } 3894 case AMDGPU::G_MERGE_VALUES: 3895 case AMDGPU::G_CONCAT_VECTORS: { 
3896 unsigned Bank = getMappingType(MRI, MI); 3897 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3898 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3899 3900 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3901 // Op1 and Dst should use the same register bank. 3902 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 3903 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 3904 break; 3905 } 3906 case AMDGPU::G_BITREVERSE: 3907 case AMDGPU::G_BITCAST: 3908 case AMDGPU::G_INTTOPTR: 3909 case AMDGPU::G_PTRTOINT: 3910 case AMDGPU::G_FABS: 3911 case AMDGPU::G_FNEG: { 3912 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3913 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3914 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3915 break; 3916 } 3917 case AMDGPU::G_AMDGPU_FFBH_U32: 3918 case AMDGPU::G_AMDGPU_FFBL_B32: 3919 case AMDGPU::G_CTLZ_ZERO_UNDEF: 3920 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 3921 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3922 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3923 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3924 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); 3925 break; 3926 } 3927 case AMDGPU::G_CTPOP: { 3928 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3929 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3930 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3931 3932 // This should really be getValueMappingSGPR64Only, but allowing the generic 3933 // code to handle the register split just makes using LegalizerHelper more 3934 // difficult. 
3935 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3936 break; 3937 } 3938 case AMDGPU::G_TRUNC: { 3939 Register Dst = MI.getOperand(0).getReg(); 3940 Register Src = MI.getOperand(1).getReg(); 3941 unsigned Bank = getRegBankID(Src, MRI); 3942 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3943 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3944 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3945 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 3946 break; 3947 } 3948 case AMDGPU::G_ZEXT: 3949 case AMDGPU::G_SEXT: 3950 case AMDGPU::G_ANYEXT: 3951 case AMDGPU::G_SEXT_INREG: { 3952 Register Dst = MI.getOperand(0).getReg(); 3953 Register Src = MI.getOperand(1).getReg(); 3954 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3955 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3956 3957 unsigned DstBank; 3958 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 3959 assert(SrcBank); 3960 switch (SrcBank->getID()) { 3961 case AMDGPU::SGPRRegBankID: 3962 DstBank = AMDGPU::SGPRRegBankID; 3963 break; 3964 default: 3965 DstBank = AMDGPU::VGPRRegBankID; 3966 break; 3967 } 3968 3969 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 3970 // 32-bits, and then to 64. 3971 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 3972 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 3973 SrcSize); 3974 break; 3975 } 3976 case AMDGPU::G_FCMP: { 3977 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3978 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 3979 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3980 OpdsMapping[1] = nullptr; // Predicate Operand. 
3981 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); 3982 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3983 break; 3984 } 3985 case AMDGPU::G_STORE: { 3986 assert(MI.getOperand(0).isReg()); 3987 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3988 3989 // FIXME: We need to specify a different reg bank once scalar stores are 3990 // supported. 3991 const ValueMapping *ValMapping = 3992 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3993 OpdsMapping[0] = ValMapping; 3994 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 3995 break; 3996 } 3997 case AMDGPU::G_ICMP: { 3998 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 3999 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4000 4001 // See if the result register has already been constrained to vcc, which may 4002 // happen due to control flow intrinsic lowering. 4003 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, 4004 AMDGPU::SGPRRegBankID); 4005 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 4006 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI); 4007 4008 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && 4009 Op2Bank == AMDGPU::SGPRRegBankID && 4010 Op3Bank == AMDGPU::SGPRRegBankID && 4011 (Size == 32 || (Size == 64 && 4012 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 4013 Subtarget.hasScalarCompareEq64())); 4014 4015 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4016 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4017 4018 // TODO: Use 32-bit for scalar output size. 4019 // SCC results will need to be copied to a 32-bit SGPR virtual register. 
4020 const unsigned ResultSize = 1; 4021 4022 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize); 4023 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size); 4024 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size); 4025 break; 4026 } 4027 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 4028 // VGPR index can be used for waterfall when indexing a SGPR vector. 4029 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4030 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4031 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4032 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4033 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI); 4034 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); 4035 4036 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); 4037 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); 4038 4039 // The index can be either if the source vector is VGPR. 4040 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4041 break; 4042 } 4043 case AMDGPU::G_INSERT_VECTOR_ELT: { 4044 unsigned OutputBankID = isSALUMapping(MI) ? 4045 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4046 4047 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4048 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4049 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4050 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 4051 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI); 4052 4053 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); 4054 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); 4055 4056 // This is a weird case, because we need to break down the mapping based on 4057 // the register bank of a different operand. 
4058 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { 4059 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID, 4060 InsertSize); 4061 } else { 4062 assert(InsertSize == 32 || InsertSize == 64); 4063 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize); 4064 } 4065 4066 // The index can be either if the source vector is VGPR. 4067 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); 4068 break; 4069 } 4070 case AMDGPU::G_UNMERGE_VALUES: { 4071 unsigned Bank = getMappingType(MRI, MI); 4072 4073 // Op1 and Dst should use the same register bank. 4074 // FIXME: Shouldn't this be the default? Why do we need to handle this? 4075 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 4076 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 4077 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 4078 } 4079 break; 4080 } 4081 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 4082 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 4083 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 4084 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 4085 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 4086 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 4087 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 4088 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 4089 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 4090 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 4091 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: 4092 case AMDGPU::G_AMDGPU_BUFFER_STORE: 4093 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 4094 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 4095 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 4096 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { 4097 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4098 4099 // rsrc 4100 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4101 4102 // vindex 4103 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4104 4105 // voffset 4106 OpdsMapping[3] = 
getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4107 4108 // soffset 4109 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4110 4111 // Any remaining operands are immediates and were correctly null 4112 // initialized. 4113 break; 4114 } 4115 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 4116 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 4117 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 4118 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 4119 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 4120 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 4121 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 4122 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 4123 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 4124 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 4125 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 4126 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: 4127 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 4128 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 4129 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 4130 // vdata_out 4131 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4132 4133 // vdata_in 4134 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4135 4136 // rsrc 4137 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4138 4139 // vindex 4140 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4141 4142 // voffset 4143 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4144 4145 // soffset 4146 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4147 4148 // Any remaining operands are immediates and were correctly null 4149 // initialized. 
4150 break; 4151 } 4152 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 4153 // vdata_out 4154 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4155 4156 // vdata_in 4157 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4158 4159 // cmp 4160 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4161 4162 // rsrc 4163 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4164 4165 // vindex 4166 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4167 4168 // voffset 4169 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4170 4171 // soffset 4172 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI); 4173 4174 // Any remaining operands are immediates and were correctly null 4175 // initialized. 4176 break; 4177 } 4178 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 4179 // Lie and claim everything is legal, even though some need to be 4180 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 4181 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4182 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4183 4184 // We need to convert this to a MUBUF if either the resource of offset is 4185 // VGPR. 
4186 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); 4187 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); 4188 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank); 4189 4190 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4191 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0); 4192 break; 4193 } 4194 case AMDGPU::G_INTRINSIC: { 4195 switch (MI.getIntrinsicID()) { 4196 default: 4197 return getInvalidInstructionMapping(); 4198 case Intrinsic::amdgcn_div_fmas: 4199 case Intrinsic::amdgcn_div_fixup: 4200 case Intrinsic::amdgcn_trig_preop: 4201 case Intrinsic::amdgcn_sin: 4202 case Intrinsic::amdgcn_cos: 4203 case Intrinsic::amdgcn_log_clamp: 4204 case Intrinsic::amdgcn_rcp: 4205 case Intrinsic::amdgcn_rcp_legacy: 4206 case Intrinsic::amdgcn_sqrt: 4207 case Intrinsic::amdgcn_rsq: 4208 case Intrinsic::amdgcn_rsq_legacy: 4209 case Intrinsic::amdgcn_rsq_clamp: 4210 case Intrinsic::amdgcn_fmul_legacy: 4211 case Intrinsic::amdgcn_fma_legacy: 4212 case Intrinsic::amdgcn_ldexp: 4213 case Intrinsic::amdgcn_frexp_mant: 4214 case Intrinsic::amdgcn_frexp_exp: 4215 case Intrinsic::amdgcn_fract: 4216 case Intrinsic::amdgcn_cvt_pkrtz: 4217 case Intrinsic::amdgcn_cvt_pknorm_i16: 4218 case Intrinsic::amdgcn_cvt_pknorm_u16: 4219 case Intrinsic::amdgcn_cvt_pk_i16: 4220 case Intrinsic::amdgcn_cvt_pk_u16: 4221 case Intrinsic::amdgcn_fmed3: 4222 case Intrinsic::amdgcn_cubeid: 4223 case Intrinsic::amdgcn_cubema: 4224 case Intrinsic::amdgcn_cubesc: 4225 case Intrinsic::amdgcn_cubetc: 4226 case Intrinsic::amdgcn_sffbh: 4227 case Intrinsic::amdgcn_fmad_ftz: 4228 case Intrinsic::amdgcn_mbcnt_lo: 4229 case Intrinsic::amdgcn_mbcnt_hi: 4230 case Intrinsic::amdgcn_mul_u24: 4231 case Intrinsic::amdgcn_mul_i24: 4232 case Intrinsic::amdgcn_mulhi_u24: 4233 case Intrinsic::amdgcn_mulhi_i24: 4234 case Intrinsic::amdgcn_lerp: 4235 case Intrinsic::amdgcn_sad_u8: 4236 case Intrinsic::amdgcn_msad_u8: 4237 case 
Intrinsic::amdgcn_sad_hi_u8: 4238 case Intrinsic::amdgcn_sad_u16: 4239 case Intrinsic::amdgcn_qsad_pk_u16_u8: 4240 case Intrinsic::amdgcn_mqsad_pk_u16_u8: 4241 case Intrinsic::amdgcn_mqsad_u32_u8: 4242 case Intrinsic::amdgcn_cvt_pk_u8_f32: 4243 case Intrinsic::amdgcn_alignbyte: 4244 case Intrinsic::amdgcn_perm: 4245 case Intrinsic::amdgcn_fdot2: 4246 case Intrinsic::amdgcn_sdot2: 4247 case Intrinsic::amdgcn_udot2: 4248 case Intrinsic::amdgcn_sdot4: 4249 case Intrinsic::amdgcn_udot4: 4250 case Intrinsic::amdgcn_sdot8: 4251 case Intrinsic::amdgcn_udot8: 4252 return getDefaultMappingVOP(MI); 4253 case Intrinsic::amdgcn_sbfe: 4254 case Intrinsic::amdgcn_ubfe: 4255 if (isSALUMapping(MI)) 4256 return getDefaultMappingSOP(MI); 4257 return getDefaultMappingVOP(MI); 4258 case Intrinsic::amdgcn_ds_swizzle: 4259 case Intrinsic::amdgcn_ds_permute: 4260 case Intrinsic::amdgcn_ds_bpermute: 4261 case Intrinsic::amdgcn_update_dpp: 4262 case Intrinsic::amdgcn_mov_dpp8: 4263 case Intrinsic::amdgcn_mov_dpp: 4264 case Intrinsic::amdgcn_strict_wwm: 4265 case Intrinsic::amdgcn_wwm: 4266 case Intrinsic::amdgcn_strict_wqm: 4267 case Intrinsic::amdgcn_wqm: 4268 case Intrinsic::amdgcn_softwqm: 4269 case Intrinsic::amdgcn_set_inactive: 4270 return getDefaultMappingAllVGPR(MI); 4271 case Intrinsic::amdgcn_kernarg_segment_ptr: 4272 case Intrinsic::amdgcn_s_getpc: 4273 case Intrinsic::amdgcn_groupstaticsize: 4274 case Intrinsic::amdgcn_reloc_constant: 4275 case Intrinsic::returnaddress: { 4276 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4277 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4278 break; 4279 } 4280 case Intrinsic::amdgcn_wqm_vote: { 4281 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4282 OpdsMapping[0] = OpdsMapping[2] 4283 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 4284 break; 4285 } 4286 case Intrinsic::amdgcn_ps_live: { 4287 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 
4288 break; 4289 } 4290 case Intrinsic::amdgcn_div_scale: { 4291 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4292 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4293 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 4294 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 4295 4296 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4297 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4298 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4299 break; 4300 } 4301 case Intrinsic::amdgcn_class: { 4302 Register Src0Reg = MI.getOperand(2).getReg(); 4303 Register Src1Reg = MI.getOperand(3).getReg(); 4304 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 4305 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 4306 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4307 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4308 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); 4309 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); 4310 break; 4311 } 4312 case Intrinsic::amdgcn_icmp: 4313 case Intrinsic::amdgcn_fcmp: { 4314 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4315 // This is not VCCRegBank because this is not used in boolean contexts. 4316 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4317 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4318 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4319 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4320 break; 4321 } 4322 case Intrinsic::amdgcn_readlane: { 4323 // This must be an SGPR, but accept a VGPR. 
4324 Register IdxReg = MI.getOperand(3).getReg(); 4325 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4326 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4327 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4328 LLVM_FALLTHROUGH; 4329 } 4330 case Intrinsic::amdgcn_readfirstlane: { 4331 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4332 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4333 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4334 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4335 break; 4336 } 4337 case Intrinsic::amdgcn_writelane: { 4338 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4339 Register SrcReg = MI.getOperand(2).getReg(); 4340 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4341 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID); 4342 Register IdxReg = MI.getOperand(3).getReg(); 4343 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4344 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4345 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4346 4347 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 4348 // to legalize. 
4349 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 4350 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4351 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4352 break; 4353 } 4354 case Intrinsic::amdgcn_if_break: { 4355 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4356 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4357 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4358 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4359 break; 4360 } 4361 case Intrinsic::amdgcn_permlane16: 4362 case Intrinsic::amdgcn_permlanex16: { 4363 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4364 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4365 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4366 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4367 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4368 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4369 break; 4370 } 4371 case Intrinsic::amdgcn_mfma_f32_4x4x1f32: 4372 case Intrinsic::amdgcn_mfma_f32_4x4x4f16: 4373 case Intrinsic::amdgcn_mfma_i32_4x4x4i8: 4374 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: 4375 case Intrinsic::amdgcn_mfma_f32_16x16x1f32: 4376 case Intrinsic::amdgcn_mfma_f32_16x16x4f32: 4377 case Intrinsic::amdgcn_mfma_f32_16x16x4f16: 4378 case Intrinsic::amdgcn_mfma_f32_16x16x16f16: 4379 case Intrinsic::amdgcn_mfma_i32_16x16x4i8: 4380 case Intrinsic::amdgcn_mfma_i32_16x16x16i8: 4381 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: 4382 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: 4383 case Intrinsic::amdgcn_mfma_f32_32x32x1f32: 4384 case Intrinsic::amdgcn_mfma_f32_32x32x2f32: 4385 case Intrinsic::amdgcn_mfma_f32_32x32x4f16: 4386 case Intrinsic::amdgcn_mfma_f32_32x32x8f16: 4387 case Intrinsic::amdgcn_mfma_i32_32x32x4i8: 4388 case 
Intrinsic::amdgcn_mfma_i32_32x32x8i8: 4389 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: 4390 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: 4391 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: 4392 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: 4393 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: 4394 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: 4395 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: 4396 case Intrinsic::amdgcn_mfma_f64_16x16x4f64: 4397 case Intrinsic::amdgcn_mfma_f64_4x4x4f64: 4398 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: 4399 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: 4400 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: 4401 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: { 4402 // Default for MAI intrinsics. 4403 // srcC can also be an immediate which can be folded later. 4404 // FIXME: Should we eventually add an alternative mapping with AGPR src 4405 // for srcA/srcB? 4406 // 4407 // vdst, srcA, srcB, srcC 4408 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 4409 OpdsMapping[0] = 4410 Info->mayNeedAGPRs() 4411 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) 4412 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4413 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4414 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4415 OpdsMapping[4] = 4416 Info->mayNeedAGPRs() 4417 ? 
getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) 4418 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4419 break; 4420 } 4421 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: 4422 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: 4423 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: 4424 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: 4425 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: 4426 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: { 4427 // vdst, srcA, srcB, srcC, idx 4428 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4429 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4430 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4431 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4432 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4433 break; 4434 } 4435 case Intrinsic::amdgcn_interp_p1: 4436 case Intrinsic::amdgcn_interp_p2: 4437 case Intrinsic::amdgcn_interp_mov: 4438 case Intrinsic::amdgcn_interp_p1_f16: 4439 case Intrinsic::amdgcn_interp_p2_f16: { 4440 const int M0Idx = MI.getNumOperands() - 1; 4441 Register M0Reg = MI.getOperand(M0Idx).getReg(); 4442 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); 4443 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4444 4445 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4446 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) 4447 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4448 4449 // Must be SGPR, but we must take whatever the original bank is and fix it 4450 // later. 
4451 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); 4452 break; 4453 } 4454 case Intrinsic::amdgcn_ballot: { 4455 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4456 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4457 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4458 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); 4459 break; 4460 } 4461 } 4462 break; 4463 } 4464 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 4465 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 4466 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 4467 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 4468 auto IntrID = MI.getIntrinsicID(); 4469 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); 4470 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); 4471 // Non-images can have complications from operands that allow both SGPR 4472 // and VGPR. For now it's too complicated to figure out the final opcode 4473 // to derive the register bank from the MCInstrDesc. 
4474 assert(RSrcIntrin->IsImage); 4475 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); 4476 } 4477 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { 4478 unsigned N = MI.getNumExplicitOperands() - 2; 4479 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); 4480 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI); 4481 if (N == 3) { 4482 // Sequential form: all operands combined into VGPR256/VGPR512 4483 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4484 if (Size > 256) 4485 Size = 512; 4486 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4487 } else { 4488 // NSA form 4489 for (unsigned I = 2; I < N; ++I) 4490 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4491 } 4492 break; 4493 } 4494 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 4495 auto IntrID = MI.getIntrinsicID(); 4496 switch (IntrID) { 4497 case Intrinsic::amdgcn_s_getreg: 4498 case Intrinsic::amdgcn_s_memtime: 4499 case Intrinsic::amdgcn_s_memrealtime: 4500 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: 4501 case Intrinsic::amdgcn_s_sendmsg_rtn: { 4502 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4503 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4504 break; 4505 } 4506 case Intrinsic::amdgcn_global_atomic_fadd: 4507 case Intrinsic::amdgcn_global_atomic_csub: 4508 case Intrinsic::amdgcn_global_atomic_fmin: 4509 case Intrinsic::amdgcn_global_atomic_fmax: 4510 case Intrinsic::amdgcn_flat_atomic_fadd: 4511 case Intrinsic::amdgcn_flat_atomic_fmin: 4512 case Intrinsic::amdgcn_flat_atomic_fmax: 4513 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: 4514 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: 4515 return getDefaultMappingAllVGPR(MI); 4516 case Intrinsic::amdgcn_ds_ordered_add: 4517 case Intrinsic::amdgcn_ds_ordered_swap: { 4518 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4519 OpdsMapping[0] = 
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4520 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4521 AMDGPU::SGPRRegBankID); 4522 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 4523 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4524 break; 4525 } 4526 case Intrinsic::amdgcn_ds_append: 4527 case Intrinsic::amdgcn_ds_consume: { 4528 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4529 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4530 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4531 break; 4532 } 4533 case Intrinsic::amdgcn_exp_compr: 4534 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4535 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4536 break; 4537 case Intrinsic::amdgcn_exp: 4538 // FIXME: Could we support packed types here? 4539 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4540 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4541 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4542 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4543 break; 4544 case Intrinsic::amdgcn_s_sendmsg: 4545 case Intrinsic::amdgcn_s_sendmsghalt: { 4546 // This must be an SGPR, but accept a VGPR. 4547 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4548 AMDGPU::SGPRRegBankID); 4549 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4550 break; 4551 } 4552 case Intrinsic::amdgcn_s_setreg: { 4553 // This must be an SGPR, but accept a VGPR. 
4554 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4555 AMDGPU::SGPRRegBankID); 4556 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4557 break; 4558 } 4559 case Intrinsic::amdgcn_end_cf: { 4560 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4561 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4562 break; 4563 } 4564 case Intrinsic::amdgcn_else: { 4565 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4566 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4567 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 4568 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 4569 break; 4570 } 4571 case Intrinsic::amdgcn_live_mask: { 4572 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4573 break; 4574 } 4575 case Intrinsic::amdgcn_wqm_demote: 4576 case Intrinsic::amdgcn_kill: { 4577 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4578 break; 4579 } 4580 case Intrinsic::amdgcn_raw_buffer_load: 4581 case Intrinsic::amdgcn_raw_tbuffer_load: { 4582 // FIXME: Should make intrinsic ID the last operand of the instruction, 4583 // then this would be the same as store 4584 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4585 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4586 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4587 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4588 break; 4589 } 4590 case Intrinsic::amdgcn_raw_buffer_load_lds: { 4591 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4592 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4593 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4594 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4595 break; 4596 } 4597 case 
Intrinsic::amdgcn_raw_buffer_store: 4598 case Intrinsic::amdgcn_raw_buffer_store_format: 4599 case Intrinsic::amdgcn_raw_tbuffer_store: { 4600 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4601 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4602 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4603 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4604 break; 4605 } 4606 case Intrinsic::amdgcn_struct_buffer_load: 4607 case Intrinsic::amdgcn_struct_tbuffer_load: { 4608 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4609 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4610 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4611 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4612 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4613 break; 4614 } 4615 case Intrinsic::amdgcn_struct_buffer_load_lds: { 4616 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4617 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4618 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4619 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4620 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI); 4621 break; 4622 } 4623 case Intrinsic::amdgcn_struct_buffer_store: 4624 case Intrinsic::amdgcn_struct_tbuffer_store: { 4625 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4626 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4627 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4628 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4629 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4630 break; 4631 } 4632 case Intrinsic::amdgcn_init_exec_from_input: 
{ 4633 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4634 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4635 break; 4636 } 4637 case Intrinsic::amdgcn_ds_gws_init: 4638 case Intrinsic::amdgcn_ds_gws_barrier: 4639 case Intrinsic::amdgcn_ds_gws_sema_br: { 4640 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4641 4642 // This must be an SGPR, but accept a VGPR. 4643 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4644 AMDGPU::SGPRRegBankID); 4645 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4646 break; 4647 } 4648 case Intrinsic::amdgcn_ds_gws_sema_v: 4649 case Intrinsic::amdgcn_ds_gws_sema_p: 4650 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 4651 // This must be an SGPR, but accept a VGPR. 4652 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, 4653 AMDGPU::SGPRRegBankID); 4654 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); 4655 break; 4656 } 4657 case Intrinsic::amdgcn_global_load_lds: { 4658 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4659 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4660 break; 4661 } 4662 default: 4663 return getInvalidInstructionMapping(); 4664 } 4665 break; 4666 } 4667 case AMDGPU::G_SELECT: { 4668 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4669 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4670 AMDGPU::SGPRRegBankID); 4671 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, 4672 AMDGPU::SGPRRegBankID); 4673 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && 4674 Op3Bank == AMDGPU::SGPRRegBankID; 4675 4676 unsigned CondBankDefault = SGPRSrcs ? 4677 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4678 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, 4679 CondBankDefault); 4680 if (CondBank == AMDGPU::SGPRRegBankID) 4681 CondBank = SGPRSrcs ? 
AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4682 else if (CondBank == AMDGPU::VGPRRegBankID) 4683 CondBank = AMDGPU::VCCRegBankID; 4684 4685 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ? 4686 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4687 4688 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID); 4689 4690 // TODO: Should report 32-bit for scalar condition type. 4691 if (Size == 64) { 4692 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4693 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 4694 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4695 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4696 } else { 4697 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); 4698 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 4699 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); 4700 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); 4701 } 4702 4703 break; 4704 } 4705 4706 case AMDGPU::G_SI_CALL: { 4707 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64); 4708 // Lie and claim everything is legal, even though some need to be 4709 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 
4710 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4711 4712 // Allow anything for implicit arguments 4713 for (unsigned I = 4; I < MI.getNumOperands(); ++I) { 4714 if (MI.getOperand(I).isReg()) { 4715 Register Reg = MI.getOperand(I).getReg(); 4716 auto OpBank = getRegBankID(Reg, MRI); 4717 unsigned Size = getSizeInBits(Reg, MRI, *TRI); 4718 OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size); 4719 } 4720 } 4721 break; 4722 } 4723 case AMDGPU::G_LOAD: 4724 case AMDGPU::G_ZEXTLOAD: 4725 case AMDGPU::G_SEXTLOAD: 4726 return getInstrMappingForLoad(MI); 4727 4728 case AMDGPU::G_ATOMICRMW_XCHG: 4729 case AMDGPU::G_ATOMICRMW_ADD: 4730 case AMDGPU::G_ATOMICRMW_SUB: 4731 case AMDGPU::G_ATOMICRMW_AND: 4732 case AMDGPU::G_ATOMICRMW_OR: 4733 case AMDGPU::G_ATOMICRMW_XOR: 4734 case AMDGPU::G_ATOMICRMW_MAX: 4735 case AMDGPU::G_ATOMICRMW_MIN: 4736 case AMDGPU::G_ATOMICRMW_UMAX: 4737 case AMDGPU::G_ATOMICRMW_UMIN: 4738 case AMDGPU::G_ATOMICRMW_FADD: 4739 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 4740 case AMDGPU::G_AMDGPU_ATOMIC_INC: 4741 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 4742 case AMDGPU::G_AMDGPU_ATOMIC_FMIN: 4743 case AMDGPU::G_AMDGPU_ATOMIC_FMAX: { 4744 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4745 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4746 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4747 break; 4748 } 4749 case AMDGPU::G_ATOMIC_CMPXCHG: { 4750 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4751 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4752 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4753 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4754 break; 4755 } 4756 case AMDGPU::G_BRCOND: { 4757 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, 4758 AMDGPU::SGPRRegBankID); 4759 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 
1); 4760 if (Bank != AMDGPU::SGPRRegBankID) 4761 Bank = AMDGPU::VCCRegBankID; 4762 4763 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); 4764 break; 4765 } 4766 case AMDGPU::G_FPTRUNC_ROUND_UPWARD: 4767 case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD: 4768 return getDefaultMappingVOP(MI); 4769 } 4770 4771 return getInstructionMapping(/*ID*/1, /*Cost*/1, 4772 getOperandsMapping(OpdsMapping), 4773 MI.getNumOperands()); 4774 } 4775