//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;

namespace {

// Observer to apply a register bank to new registers created by LegalizerHelper.
//
// The observer only records the instructions it is told about in
// createdInstr(); the banks are assigned when the observer is destroyed, by
// which time the recorded instructions have their operands.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineRegisterInfo &MRI;
  // Bank given to every new register that is not an s1 value.
  const RegisterBank *NewBank;
  // Instructions reported through createdInstr(), processed in the destructor.
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : MRI(MRI_), NewBank(RB) {}

  // Assign banks to the registers of every instruction seen while this
  // observer was alive.
  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Give every register operand of \p MI that has neither a register class
  /// nor a register bank the bank this observer was constructed with. s1
  /// registers are special-cased: they get the SCC bank when the requested
  /// bank is SGPR, and the VCC bank otherwise.
  void applyBank(MachineInstr &MI) {
    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      // Leave registers that already have a class or bank untouched.
      if (MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      // FIXME: This might not be enough to detect when SCC should be used.
      if (MRI.getType(Reg) == LLT::scalar(1))
        RB = (NewBank == &AMDGPU::SGPRRegBank ?
              &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    // Remember it so applyBank() can run on it later from the destructor.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {}
};

} // end anonymous namespace

/// Construct the AMDGPU register bank info. \p TRI is downcast to
/// SIRegisterInfo and kept for later queries. The first construction also
/// checks (in asserts builds) that the SGPR and VGPR bank IDs resolve to the
/// generated AMDGPU::SGPRRegBank / AMDGPU::VGPRRegBank objects.
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
    : AMDGPUGenRegisterBankInfo(),
      TRI(static_cast<const SIRegisterInfo*>(&TRI)) {

  // HACK: Until this is fully tablegen'd.
  // The function-local static makes the checks below run only once per
  // process, no matter how many instances are constructed.
  static bool AlreadyInit = false;
  if (AlreadyInit)
    return;

  AlreadyInit = true;

  const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
  (void)RBSGPR;
  assert(&RBSGPR == &AMDGPU::SGPRRegBank);

  const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
  (void)RBVGPR;
  assert(&RBVGPR == &AMDGPU::VGPRRegBank);

}

/// Return the cost of copying a \p Size bit value from bank \p Src to bank
/// \p Dst. Copies this target treats as impossible return the maximum
/// unsigned value; everything else defers to RegisterBankInfo::copyCost.
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  // VGPR -> SGPR copies are rejected outright.
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      Src.getID() == AMDGPU::VGPRRegBankID) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SCCRegBankID ||
       Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // VCC -> SCC is rejected for every size.
  if (Dst.getID() == AMDGPU::SCCRegBankID &&
      Src.getID() == AMDGPU::VCCRegBankID)
    return std::numeric_limits<unsigned>::max();

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

/// Return the cost of splitting a value according to \p ValMapping.
/// \p CurBank is not used by this implementation.
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  // NOTE(review): control only reaches this point when NumBreakDowns < 2, so
  // the `NumBreakDowns == 2` clause below cannot hold here. Either the early
  // return above is broader than intended or this assert is stale; confirm
  // which before relying on the `return 1` path.
  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

/// Map a register class to a bank: SGPR classes (per SIRegisterInfo) map to
/// the SGPR bank, every other class maps to the VGPR bank.
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
    const TargetRegisterClass &RC) const {

  if (TRI->isSGPRClass(&RC))
    return getRegBank(AMDGPU::SGPRRegBankID);

  return getRegBank(AMDGPU::VGPRRegBankID);
}

/// Build one alternative InstructionMapping per row of \p Table.
///
/// \p RegSrcOpIdx lists the operand indices of \p MI that the table columns
/// describe: column I of a row gives the bank for operand RegSrcOpIdx[I].
/// Explicit defs are first given a VGPR mapping; a table column that names a
/// def operand overrides that. Operands that are neither explicit defs nor
/// listed in \p RegSrcOpIdx keep a null mapping.
template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  // The sizes do not depend on the table row, so compute them once.
  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  // Default every explicit def to the VGPR bank.
  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    // Overwrite the listed operands with this row's banks; Operands is reused
    // across rows, and every listed slot is rewritten on each iteration.
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

/// Alternative mappings for a G_INTRINSIC. The intrinsic ID is read from the
/// first operand after the explicit defs. Each table row is
/// { { bank per listed operand }, cost }. Intrinsics without a table fall back
/// to RegisterBankInfo::getInstrAlternativeMappings.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    // Columns: operand 0 (the def), then operands 2 and 3.
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // Columns: operand 0 (the def), then operands 2, 3 and 4.
    // NOTE(review): this comment previously read "rsrc, voffset, offset",
    // which looks copied from the buffer_load table and does not match a
    // four-column writelane table.
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

/// Alternative mappings for a G_INTRINSIC_W_SIDE_EFFECTS. Same structure as
/// getInstrAlternativeMappingsIntrinsic: the intrinsic ID selects a table of
/// { { bank per listed operand }, cost } rows, and unknown intrinsics fall back
/// to the generic implementation.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load: {
    static const OpRegBankEntry<3> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Waterfall loop needed for rsrc. In the worst case this will execute
      // approximately an extra 10 * wavesize + 2 instructions.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    // Columns: operand 0 (the def), then operands 2 and 3.
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    // Single column: operand 2.
    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

/// Return true if \p MI has exactly one memory operand and
/// AMDGPUInstrInfo::isUniformMMO accepts it. Instructions with zero or several
/// memory operands are reported as not uniform.
static bool isInstrUniform(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  return AMDGPUInstrInfo::isUniformMMO(MMO);
}

/// Return the alternative register bank mappings for \p MI, dispatching on
/// its opcode. Each getInstructionMapping call below takes
/// (mapping ID, cost, operand mappings, number of operands). Opcodes without
/// a dedicated case, and cases that `break`, end in the generic
/// RegisterBankInfo::getInstrAlternativeMappings.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();


  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    // The result may live in either bank at the same cost.
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, { 0 }, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      // NOTE(review): SCCMapping and SGPRMapping below both use mapping ID 1;
      // confirm that sharing an ID between two alternatives is intended.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &SGPRMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&SGPRMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 10, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    // Only the 64-bit case gets extra alternatives; other sizes use the
    // generic fallback after the switch.
    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(
      3, 3, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SVMapping);

    // SGPR in LHS is slightly preferable, so make VS more expensive than
    // SV.
    // NOTE(review): VSMapping reuses mapping ID 3 from SVMapping and differs
    // only in cost and operand banks; confirm the shared ID is intended.
    const InstructionMapping &VSMapping = getInstructionMapping(
      3, 4, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VSMapping);
    break;
  }
  case TargetOpcode::G_LOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
    // FIXME: Should we be hard coding the size for these mappings?
    // The all-SGPR mapping is only offered when isInstrUniform() accepts MI.
    // The pointer operand is mapped as 64 bits in both alternatives.
    if (isInstrUniform(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;

  }
  case TargetOpcode::G_ICMP: {
    // Size is taken from the first compared value (operand 2); operand 1 is
    // the predicate and gets no mapping. Only the all-SGPR form produces an
    // SCC result; every form with a VGPR input produces a VCC result.
    unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SVMapping);

    const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    // Two alternatives: all-SGPR with an SCC condition, or all-VGPR with a
    // VCC condition.
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    // Columns: result, first source, second source.
    static const OpRegBankEntry<3> Table[4] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Scalar requires cmp+select, and extends if 16-bit.
      // FIXME: Should there be separate costs for 32 and 16-bit
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    // Five operands; the two 1-bit operands (indices 1 and 4) use SCC in the
    // scalar form and VCC in the vector form.
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // The condition may be SCC or VCC; the second operand gets no mapping.
    // NOTE(review): both alternatives use mapping ID 1; confirm that is
    // intended.
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  // NOTE(review): alternatives pushed to AltMappings by a case that ends in
  // `break` (the 64-bit G_AND/G_OR/G_XOR path) are not returned from here;
  // only the generic result is. Confirm that this is intended.
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

/// Split \p Reg into two \p HalfTy (32-bit) registers with a
/// G_UNMERGE_VALUES built through \p B. The new registers are given the same
/// bank as \p Reg and appended to \p Regs, low half first.
void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy.
/// The old and new types must have the same size in bits (asserted).
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

/// Return the type that is half of \p Ty: for a vector, the same element type
/// with half the element count; otherwise a scalar with half the bit width.
/// The element count, respectively the bit width, must be even (asserted).
static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that rest of the instructions are
/// moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity to try for compare values to identify the
/// unique values used.
///
/// If none of the operands named by \p OpIndices is in the VGPR bank, the
/// function returns without changing anything.
void AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineFunction *MF = MI.getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock::iterator I(MI);

  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    // Only operands currently assigned to the VGPR bank need the loop.
    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  if (SGPROperandRegs.empty())
    return;

  MachineIRBuilder B(MI);
  // For every def of MI: the def itself, an undef initial value built before
  // MI, and a fresh register for the loop-header G_PHI. The new registers get
  // the def's bank.
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;
  for (MachineOperand &Def : MI.defs()) {
    LLT ResTy = MRI.getType(Def.getReg());
    const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
    ResultRegs.push_back(Def.getReg());
    Register InitReg = B.buildUndef(ResTy).getReg(0);
    Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
    InitResultRegs.push_back(InitReg);
    PhiRegs.push_back(PhiReg);
    MRI.setRegBank(PhiReg, *DefBank);
    MRI.setRegBank(InitReg, *DefBank);
  }

  Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // To insert the loop we need to split the block. Three new blocks are
  // inserted right after MBB, in the order LoopBB, RestoreExecBB, RemainderBB.
  // Everything from this instruction to the end of MBB is moved to
  // RemainderBB; the instruction itself is then moved into LoopBB further
  // down.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  // The loop either repeats or exits to the block that restores EXEC.
  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // EXEC PHI: InitSaveExecReg when entering from MBB, NewExec on the back
  // edge.
  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  // One G_PHI per def of MI, merging the undef initial value with the value
  // produced inside the loop.
  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result))  // Mid-loop value.
      .addMBB(LoopBB);
  }

  // Move the instruction into the loop.
  // NOTE(review): the splice above already moved [I, end) out of MBB, so at
  // this point the instruction sits in RemainderBB while &MBB is named as the
  // source block. Confirm this is intended.
  LoopBB->splice(LoopBB->end(), &MBB, I);
  I = std::prev(LoopBB->end());

  B.setInstr(*I);

  // Accumulated "this lane matches the values just read" condition, combined
  // across all replaced operands. Stays NoRegister until the first compare.
  Register CondReg;

  for (MachineOperand &Op : MI.uses()) {
    if (!Op.isReg())
      continue;

    assert(!Op.isDef());
    if (SGPROperandRegs.count(Op.getReg())) {
      LLT OpTy = MRI.getType(Op.getReg());
      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
          .addReg(Op.getReg());

        Register NewCondReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the just read M0 value to all possible Idx values.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(Op.getReg());
        // From here on MI uses the scalar value that was just read.
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg
            = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

          // If there are multiple operands to consider, and the conditions.
          B.buildInstr(AMDGPU::S_AND_B64)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
                                          : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.

        B.setMBB(MBB);
        auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
        B.setInstr(*I);

        // All operands of the unmerge except the last one (the source) are
        // pieces.
        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            // Low half, via the sub0 subregister of the 64-bit piece.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            // High half, via the sub1 subregister of the 64-bit piece.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            // Re-form the 64-bit scalar value for the 64-bit compare below.
            CurrentLaneOpReg =
                B.buildMerge(LLT::scalar(64),
                             {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
                    .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the
              // merged pieces.
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg
            = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          // Compare the scalar value just read against the per-lane piece.
          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg
              = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

            // If there are multiple operands to consider, and the conditions.
            B.buildInstr(AMDGPU::S_AND_B64)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        // The reassembled operand is a scalar value, so put it in the SGPR
        // bank.
        MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
      }
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC from the accumulated condition; the instruction's result is
  // written to NewExec, which feeds the EXEC PHI on the back edge.
  B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(AMDGPU::S_XOR_B64_term)
    .addDef(AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  // This is emitted at the end of MBB, the block that precedes the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
    .addReg(AMDGPU::EXEC);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(AMDGPU::S_MOV_B64_term)
    .addDef(AMDGPU::EXEC)
    .addReg(SaveExecReg);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
922 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( 923 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const { 924 Register Reg = MI.getOperand(OpIdx).getReg(); 925 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 926 if (Bank != &AMDGPU::VGPRRegBank) 927 return; 928 929 MachineIRBuilder B(MI); 930 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 931 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32) 932 .addDef(SGPR) 933 .addReg(Reg); 934 935 const TargetRegisterClass *Constrained = 936 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI); 937 (void)Constrained; 938 assert(Constrained && "Failed to constrain readfirstlane src reg"); 939 940 MI.getOperand(OpIdx).setReg(SGPR); 941 } 942 943 // When regbankselect repairs registers, it will insert a repair instruction 944 // which defines the repaired register. Then it calls applyMapping and expects 945 // that the targets will either delete or rewrite the originally wrote to the 946 // repaired registers. Beccause of this, we end up in a situation where 947 // we have 2 instructions defining the same registers. 948 static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI, 949 Register Reg, 950 const MachineInstr &MI) { 951 // Is there some way we can assert that there are exactly 2 def instructions? 952 for (MachineInstr &Other : MRI.def_instructions(Reg)) { 953 if (&Other != &MI) 954 return &Other; 955 } 956 957 return nullptr; 958 } 959 960 bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, 961 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 962 MachineRegisterInfo &MRI) const { 963 Register DstReg = MI.getOperand(0).getReg(); 964 const LLT LoadTy = MRI.getType(DstReg); 965 unsigned LoadSize = LoadTy.getSizeInBits(); 966 const unsigned MaxNonSmrdLoadSize = 128; 967 // 128-bit loads are supported for all instruction types. 
968 if (LoadSize <= MaxNonSmrdLoadSize) 969 return false; 970 971 SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0)); 972 SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1)); 973 974 // If the pointer is an SGPR, we have nothing to do. 975 if (SrcRegs.empty()) 976 return false; 977 978 assert(LoadSize % MaxNonSmrdLoadSize == 0); 979 980 // We want to get the repair instruction now, because it will help us 981 // determine which instruction the legalizer inserts that will also 982 // write to DstReg. 983 MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI); 984 985 // RegBankSelect only emits scalar types, so we need to reset the pointer 986 // operand to a pointer type. 987 Register BasePtrReg = SrcRegs[0]; 988 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 989 MRI.setType(BasePtrReg, PtrTy); 990 991 MachineIRBuilder B(MI); 992 993 unsigned SplitElts = 994 MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits(); 995 const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType()); 996 ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank); 997 GISelObserverWrapper Observer(&O); 998 B.setChangeObserver(Observer); 999 LegalizerHelper Helper(B.getMF(), Observer, B); 1000 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1001 return false; 1002 1003 // At this point, the legalizer has split the original load into smaller 1004 // loads. At the end of lowering, it inserts an instruction (LegalizedInst) 1005 // that combines the outputs of the lower loads and writes it to DstReg. 1006 // The register bank selector has also added the RepairInst which writes to 1007 // DstReg as well. 1008 1009 MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst); 1010 1011 // Replace the output of the LegalizedInst with a temporary register, since 1012 // RepairInst already defines DstReg. 
1013 Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg)); 1014 LegalizedInst->getOperand(0).setReg(TmpReg); 1015 B.setInsertPt(*RepairInst->getParent(), RepairInst); 1016 1017 for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) { 1018 Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 1019 B.buildConstant(IdxReg, DefIdx); 1020 MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID)); 1021 B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg); 1022 } 1023 1024 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); 1025 return true; 1026 } 1027 1028 // For cases where only a single copy is inserted for matching register banks. 1029 // Replace the register in the instruction operand 1030 static void substituteSimpleCopyRegs( 1031 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1032 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1033 if (!SrcReg.empty()) { 1034 assert(SrcReg.size() == 1); 1035 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1036 } 1037 } 1038 1039 void AMDGPURegisterBankInfo::applyMappingImpl( 1040 const OperandsMapper &OpdMapper) const { 1041 MachineInstr &MI = OpdMapper.getMI(); 1042 unsigned Opc = MI.getOpcode(); 1043 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1044 switch (Opc) { 1045 case AMDGPU::G_SELECT: { 1046 Register DstReg = MI.getOperand(0).getReg(); 1047 LLT DstTy = MRI.getType(DstReg); 1048 if (DstTy.getSizeInBits() != 64) 1049 break; 1050 1051 LLT HalfTy = getHalfSizedType(DstTy); 1052 1053 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 1054 SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1)); 1055 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 1056 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 1057 1058 // All inputs are SGPRs, nothing special to do. 
1059 if (DefRegs.empty()) { 1060 assert(Src1Regs.empty() && Src2Regs.empty()); 1061 break; 1062 } 1063 1064 MachineIRBuilder B(MI); 1065 if (Src0Regs.empty()) 1066 Src0Regs.push_back(MI.getOperand(1).getReg()); 1067 else { 1068 assert(Src0Regs.size() == 1); 1069 } 1070 1071 if (Src1Regs.empty()) 1072 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 1073 else { 1074 setRegsToType(MRI, Src1Regs, HalfTy); 1075 } 1076 1077 if (Src2Regs.empty()) 1078 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 1079 else 1080 setRegsToType(MRI, Src2Regs, HalfTy); 1081 1082 setRegsToType(MRI, DefRegs, HalfTy); 1083 1084 B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]); 1085 B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]); 1086 1087 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); 1088 MI.eraseFromParent(); 1089 return; 1090 } 1091 case AMDGPU::G_AND: 1092 case AMDGPU::G_OR: 1093 case AMDGPU::G_XOR: { 1094 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 1095 // there is a VGPR input. 1096 Register DstReg = MI.getOperand(0).getReg(); 1097 LLT DstTy = MRI.getType(DstReg); 1098 if (DstTy.getSizeInBits() != 64) 1099 break; 1100 1101 LLT HalfTy = getHalfSizedType(DstTy); 1102 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 1103 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 1104 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 1105 1106 // All inputs are SGPRs, nothing special to do. 1107 if (DefRegs.empty()) { 1108 assert(Src0Regs.empty() && Src1Regs.empty()); 1109 break; 1110 } 1111 1112 assert(DefRegs.size() == 2); 1113 assert(Src0Regs.size() == Src1Regs.size() && 1114 (Src0Regs.empty() || Src0Regs.size() == 2)); 1115 1116 // Depending on where the source registers came from, the generic code may 1117 // have decided to split the inputs already or not. If not, we still need to 1118 // extract the values. 
1119 MachineIRBuilder B(MI); 1120 1121 if (Src0Regs.empty()) 1122 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 1123 else 1124 setRegsToType(MRI, Src0Regs, HalfTy); 1125 1126 if (Src1Regs.empty()) 1127 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 1128 else 1129 setRegsToType(MRI, Src1Regs, HalfTy); 1130 1131 setRegsToType(MRI, DefRegs, HalfTy); 1132 1133 B.buildInstr(Opc) 1134 .addDef(DefRegs[0]) 1135 .addUse(Src0Regs[0]) 1136 .addUse(Src1Regs[0]); 1137 1138 B.buildInstr(Opc) 1139 .addDef(DefRegs[1]) 1140 .addUse(Src0Regs[1]) 1141 .addUse(Src1Regs[1]); 1142 1143 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); 1144 MI.eraseFromParent(); 1145 return; 1146 } 1147 case AMDGPU::G_ADD: 1148 case AMDGPU::G_SUB: 1149 case AMDGPU::G_MUL: { 1150 Register DstReg = MI.getOperand(0).getReg(); 1151 LLT DstTy = MRI.getType(DstReg); 1152 if (DstTy != LLT::scalar(16)) 1153 break; 1154 1155 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); 1156 if (DstBank == &AMDGPU::VGPRRegBank) 1157 break; 1158 1159 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 
1160 MachineFunction *MF = MI.getParent()->getParent(); 1161 MachineIRBuilder B(MI); 1162 ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); 1163 GISelObserverWrapper Observer(&ApplySALU); 1164 LegalizerHelper Helper(*MF, Observer, B); 1165 1166 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 1167 LegalizerHelper::Legalized) 1168 llvm_unreachable("widen scalar should have succeeded"); 1169 return; 1170 } 1171 case AMDGPU::G_SMIN: 1172 case AMDGPU::G_SMAX: 1173 case AMDGPU::G_UMIN: 1174 case AMDGPU::G_UMAX: { 1175 Register DstReg = MI.getOperand(0).getReg(); 1176 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); 1177 if (DstBank == &AMDGPU::VGPRRegBank) 1178 break; 1179 1180 MachineFunction *MF = MI.getParent()->getParent(); 1181 MachineIRBuilder B(MI); 1182 ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); 1183 GISelObserverWrapper Observer(&ApplySALU); 1184 LegalizerHelper Helper(*MF, Observer, B); 1185 1186 // Turn scalar min/max into a compare and select. 1187 LLT Ty = MRI.getType(DstReg); 1188 LLT S32 = LLT::scalar(32); 1189 LLT S16 = LLT::scalar(16); 1190 1191 if (Ty == S16) { 1192 // Need to widen to s32, and expand as cmp + select. 1193 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 1194 llvm_unreachable("widenScalar should have succeeded"); 1195 1196 // FIXME: This is relying on widenScalar leaving MI in place. 
1197 if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized) 1198 llvm_unreachable("lower should have succeeded"); 1199 } else { 1200 if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized) 1201 llvm_unreachable("lower should have succeeded"); 1202 } 1203 1204 return; 1205 } 1206 case AMDGPU::G_SEXT: 1207 case AMDGPU::G_ZEXT: { 1208 Register SrcReg = MI.getOperand(1).getReg(); 1209 LLT SrcTy = MRI.getType(SrcReg); 1210 bool Signed = Opc == AMDGPU::G_SEXT; 1211 1212 MachineIRBuilder B(MI); 1213 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 1214 1215 Register DstReg = MI.getOperand(0).getReg(); 1216 LLT DstTy = MRI.getType(DstReg); 1217 if (DstTy.isScalar() && 1218 SrcBank != &AMDGPU::SGPRRegBank && 1219 SrcBank != &AMDGPU::SCCRegBank && 1220 SrcBank != &AMDGPU::VCCRegBank && 1221 // FIXME: Should handle any type that round to s64 when irregular 1222 // breakdowns supported. 1223 DstTy.getSizeInBits() == 64 && 1224 SrcTy.getSizeInBits() <= 32) { 1225 const LLT S32 = LLT::scalar(32); 1226 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 1227 1228 // Extend to 32-bit, and then extend the low half. 1229 if (Signed) { 1230 // TODO: Should really be buildSExtOrCopy 1231 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 1232 1233 // Replicate sign bit from 32-bit extended part. 1234 auto ShiftAmt = B.buildConstant(S32, 31); 1235 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); 1236 B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt); 1237 } else { 1238 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 1239 B.buildConstant(DefRegs[1], 0); 1240 } 1241 1242 MRI.setRegBank(DstReg, *SrcBank); 1243 MI.eraseFromParent(); 1244 return; 1245 } 1246 1247 if (SrcTy != LLT::scalar(1)) 1248 return; 1249 1250 if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) { 1251 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 1252 1253 const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ? 
1254 &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank; 1255 1256 unsigned DstSize = DstTy.getSizeInBits(); 1257 // 64-bit select is SGPR only 1258 const bool UseSel64 = DstSize > 32 && 1259 SrcBank->getID() == AMDGPU::SCCRegBankID; 1260 1261 // TODO: Should s16 select be legal? 1262 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 1263 auto True = B.buildConstant(SelType, Signed ? -1 : 1); 1264 auto False = B.buildConstant(SelType, 0); 1265 1266 MRI.setRegBank(True.getReg(0), *DstBank); 1267 MRI.setRegBank(False.getReg(0), *DstBank); 1268 MRI.setRegBank(DstReg, *DstBank); 1269 1270 if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) { 1271 B.buildSelect(DefRegs[0], SrcReg, True, False); 1272 B.buildCopy(DefRegs[1], DefRegs[0]); 1273 } else if (DstSize < 32) { 1274 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 1275 MRI.setRegBank(Sel.getReg(0), *DstBank); 1276 B.buildTrunc(DstReg, Sel); 1277 } else { 1278 B.buildSelect(DstReg, SrcReg, True, False); 1279 } 1280 1281 MI.eraseFromParent(); 1282 return; 1283 } 1284 1285 // Fixup the case with an s1 src that isn't a condition register. Use shifts 1286 // instead of introducing a compare to avoid an unnecessary condition 1287 // register (and since there's no scalar 16-bit compares). 
1288 auto Ext = B.buildAnyExt(DstTy, SrcReg); 1289 auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1); 1290 auto Shl = B.buildShl(DstTy, Ext, ShiftAmt); 1291 1292 if (MI.getOpcode() == AMDGPU::G_SEXT) 1293 B.buildAShr(DstReg, Shl, ShiftAmt); 1294 else 1295 B.buildLShr(DstReg, Shl, ShiftAmt); 1296 1297 MRI.setRegBank(DstReg, *SrcBank); 1298 MRI.setRegBank(Ext.getReg(0), *SrcBank); 1299 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); 1300 MRI.setRegBank(Shl.getReg(0), *SrcBank); 1301 MI.eraseFromParent(); 1302 return; 1303 } 1304 case AMDGPU::G_EXTRACT_VECTOR_ELT: 1305 applyDefaultMapping(OpdMapper); 1306 executeInWaterfallLoop(MI, MRI, { 2 }); 1307 return; 1308 case AMDGPU::G_INTRINSIC: { 1309 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { 1310 case Intrinsic::amdgcn_s_buffer_load: { 1311 // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS 1312 executeInWaterfallLoop(MI, MRI, { 2, 3 }); 1313 return; 1314 } 1315 case Intrinsic::amdgcn_readlane: { 1316 substituteSimpleCopyRegs(OpdMapper, 2); 1317 1318 assert(empty(OpdMapper.getVRegs(0))); 1319 assert(empty(OpdMapper.getVRegs(3))); 1320 1321 // Make sure the index is an SGPR. It doesn't make sense to run this in a 1322 // waterfall loop, so assume it's a uniform value. 
1323 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 1324 return; 1325 } 1326 case Intrinsic::amdgcn_writelane: { 1327 assert(empty(OpdMapper.getVRegs(0))); 1328 assert(empty(OpdMapper.getVRegs(2))); 1329 assert(empty(OpdMapper.getVRegs(3))); 1330 1331 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 1332 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 1333 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 1334 return; 1335 } 1336 default: 1337 break; 1338 } 1339 break; 1340 } 1341 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 1342 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { 1343 case Intrinsic::amdgcn_buffer_load: { 1344 executeInWaterfallLoop(MI, MRI, { 2 }); 1345 return; 1346 } 1347 case Intrinsic::amdgcn_ds_ordered_add: 1348 case Intrinsic::amdgcn_ds_ordered_swap: { 1349 // This is only allowed to execute with 1 lane, so readfirstlane is safe. 1350 assert(empty(OpdMapper.getVRegs(0))); 1351 substituteSimpleCopyRegs(OpdMapper, 3); 1352 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 1353 return; 1354 } 1355 case Intrinsic::amdgcn_s_sendmsg: 1356 case Intrinsic::amdgcn_s_sendmsghalt: { 1357 // FIXME: Should this use a waterfall loop? 
1358 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 1359 return; 1360 } 1361 default: 1362 break; 1363 } 1364 break; 1365 } 1366 case AMDGPU::G_LOAD: { 1367 if (applyMappingWideLoad(MI, OpdMapper, MRI)) 1368 return; 1369 break; 1370 } 1371 default: 1372 break; 1373 } 1374 1375 return applyDefaultMapping(OpdMapper); 1376 } 1377 1378 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 1379 const MachineFunction &MF = *MI.getParent()->getParent(); 1380 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1381 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { 1382 if (!MI.getOperand(i).isReg()) 1383 continue; 1384 Register Reg = MI.getOperand(i).getReg(); 1385 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 1386 if (Bank->getID() == AMDGPU::VGPRRegBankID) 1387 return false; 1388 1389 assert(Bank->getID() == AMDGPU::SGPRRegBankID || 1390 Bank->getID() == AMDGPU::SCCRegBankID); 1391 } 1392 } 1393 return true; 1394 } 1395 1396 const RegisterBankInfo::InstructionMapping & 1397 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 1398 const MachineFunction &MF = *MI.getParent()->getParent(); 1399 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1400 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 1401 1402 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 1403 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 1404 unsigned BankID = Size == 1 ? 
AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID; 1405 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); 1406 } 1407 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 1408 MI.getNumOperands()); 1409 } 1410 1411 const RegisterBankInfo::InstructionMapping & 1412 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { 1413 const MachineFunction &MF = *MI.getParent()->getParent(); 1414 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1415 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 1416 unsigned OpdIdx = 0; 1417 1418 unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 1419 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); 1420 1421 if (MI.getOperand(OpdIdx).isIntrinsicID()) 1422 OpdsMapping[OpdIdx++] = nullptr; 1423 1424 Register Reg1 = MI.getOperand(OpdIdx).getReg(); 1425 unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI); 1426 1427 unsigned DefaultBankID = Size1 == 1 ? 1428 AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 1429 unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID); 1430 1431 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1); 1432 1433 for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) { 1434 const MachineOperand &MO = MI.getOperand(OpdIdx); 1435 if (!MO.isReg()) 1436 continue; 1437 1438 unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI); 1439 unsigned BankID = Size == 1 ? 
AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 1440 OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size); 1441 } 1442 1443 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 1444 MI.getNumOperands()); 1445 } 1446 1447 const RegisterBankInfo::InstructionMapping & 1448 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { 1449 const MachineFunction &MF = *MI.getParent()->getParent(); 1450 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1451 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 1452 1453 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 1454 const MachineOperand &Op = MI.getOperand(I); 1455 if (!Op.isReg()) 1456 continue; 1457 1458 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); 1459 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 1460 } 1461 1462 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 1463 MI.getNumOperands()); 1464 } 1465 1466 const RegisterBankInfo::InstructionMapping & 1467 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { 1468 1469 const MachineFunction &MF = *MI.getParent()->getParent(); 1470 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1471 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 1472 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 1473 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); 1474 unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 1475 1476 const ValueMapping *ValMapping; 1477 const ValueMapping *PtrMapping; 1478 1479 if (isInstrUniform(MI)) { 1480 // We have a uniform instruction so we want to use an SMRD load 1481 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 1482 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); 1483 } else { 1484 ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy); 1485 // FIXME: What would happen if we 
used SGPRRegBankID here? 1486 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); 1487 } 1488 1489 OpdsMapping[0] = ValMapping; 1490 OpdsMapping[1] = PtrMapping; 1491 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( 1492 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); 1493 return Mapping; 1494 1495 // FIXME: Do we want to add a mapping for FLAT load, or should we just 1496 // handle that during instruction selection? 1497 } 1498 1499 unsigned 1500 AMDGPURegisterBankInfo::getRegBankID(Register Reg, 1501 const MachineRegisterInfo &MRI, 1502 const TargetRegisterInfo &TRI, 1503 unsigned Default) const { 1504 1505 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI); 1506 return Bank ? Bank->getID() : Default; 1507 } 1508 1509 /// 1510 /// This function must return a legal mapping, because 1511 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called 1512 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a 1513 /// VGPR to SGPR generated is illegal. 1514 /// 1515 const RegisterBankInfo::InstructionMapping & 1516 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { 1517 const MachineFunction &MF = *MI.getParent()->getParent(); 1518 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1519 1520 if (MI.isRegSequence()) { 1521 // If any input is a VGPR, the result must be a VGPR. The default handling 1522 // assumes any copy between banks is legal. 1523 unsigned BankID = AMDGPU::SGPRRegBankID; 1524 1525 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 1526 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI); 1527 // It doesn't make sense to use vcc or scc banks here, so just ignore 1528 // them. 
1529 if (OpBank != AMDGPU::SGPRRegBankID) { 1530 BankID = AMDGPU::VGPRRegBankID; 1531 break; 1532 } 1533 } 1534 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 1535 1536 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); 1537 return getInstructionMapping( 1538 1, /*Cost*/ 1, 1539 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 1540 } 1541 1542 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies 1543 // properly. 1544 // 1545 // TODO: There are additional exec masking dependencies to analyze. 1546 if (MI.getOpcode() == TargetOpcode::G_PHI) { 1547 // TODO: Generate proper invalid bank enum. 1548 int ResultBank = -1; 1549 1550 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 1551 Register Reg = MI.getOperand(I).getReg(); 1552 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 1553 1554 // FIXME: Assuming VGPR for any undetermined inputs. 1555 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 1556 ResultBank = AMDGPU::VGPRRegBankID; 1557 break; 1558 } 1559 1560 unsigned OpBank = Bank->getID(); 1561 // scc, scc -> sgpr 1562 if (OpBank == AMDGPU::SCCRegBankID) { 1563 // There's only one SCC register, so a phi requires copying to SGPR. 
1564 OpBank = AMDGPU::SGPRRegBankID; 1565 } else if (OpBank == AMDGPU::VCCRegBankID) { 1566 // vcc, vcc -> vcc 1567 // vcc, sgpr -> vgpr 1568 if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) { 1569 ResultBank = AMDGPU::VGPRRegBankID; 1570 break; 1571 } 1572 } 1573 1574 ResultBank = OpBank; 1575 } 1576 1577 assert(ResultBank != -1); 1578 1579 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1580 1581 const ValueMapping &ValMap = 1582 getValueMapping(0, Size, getRegBank(ResultBank)); 1583 return getInstructionMapping( 1584 1, /*Cost*/ 1, 1585 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 1586 } 1587 1588 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 1589 if (Mapping.isValid()) 1590 return Mapping; 1591 1592 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 1593 1594 switch (MI.getOpcode()) { 1595 default: 1596 return getInvalidInstructionMapping(); 1597 1598 case AMDGPU::G_AND: 1599 case AMDGPU::G_OR: 1600 case AMDGPU::G_XOR: { 1601 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1602 if (Size == 1) { 1603 const RegisterBank *DstBank 1604 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 1605 1606 unsigned TargetBankID = -1; 1607 unsigned BankLHS = -1; 1608 unsigned BankRHS = -1; 1609 if (DstBank) { 1610 TargetBankID = DstBank->getID(); 1611 if (DstBank == &AMDGPU::VCCRegBank) { 1612 TargetBankID = AMDGPU::VCCRegBankID; 1613 BankLHS = AMDGPU::VCCRegBankID; 1614 BankRHS = AMDGPU::VCCRegBankID; 1615 } else if (DstBank == &AMDGPU::SCCRegBank) { 1616 TargetBankID = AMDGPU::SCCRegBankID; 1617 BankLHS = AMDGPU::SGPRRegBankID; 1618 BankRHS = AMDGPU::SGPRRegBankID; 1619 } else { 1620 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 1621 AMDGPU::SGPRRegBankID); 1622 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 1623 AMDGPU::SGPRRegBankID); 1624 } 1625 } else { 1626 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 1627 
AMDGPU::VCCRegBankID); 1628 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 1629 AMDGPU::VCCRegBankID); 1630 1631 // Both inputs should be true booleans to produce a boolean result. 1632 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 1633 TargetBankID = AMDGPU::VGPRRegBankID; 1634 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 1635 TargetBankID = AMDGPU::VCCRegBankID; 1636 BankLHS = AMDGPU::VCCRegBankID; 1637 BankRHS = AMDGPU::VCCRegBankID; 1638 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 1639 TargetBankID = AMDGPU::SGPRRegBankID; 1640 } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) { 1641 // The operation must be done on a 32-bit register, but it will set 1642 // scc. The result type could interchangably be SCC or SGPR, since 1643 // both values will be produced. 1644 TargetBankID = AMDGPU::SCCRegBankID; 1645 BankLHS = AMDGPU::SGPRRegBankID; 1646 BankRHS = AMDGPU::SGPRRegBankID; 1647 } 1648 } 1649 1650 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 1651 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 1652 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 1653 break; 1654 } 1655 1656 if (Size == 64) { 1657 1658 if (isSALUMapping(MI)) { 1659 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 1660 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 1661 } else { 1662 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 1663 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/); 1664 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 1665 1666 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/); 1667 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 1668 } 1669 1670 break; 1671 } 1672 1673 LLVM_FALLTHROUGH; 1674 } 1675 1676 case AMDGPU::G_GEP: 1677 case AMDGPU::G_ADD: 1678 case 
AMDGPU::G_SUB: 1679 case AMDGPU::G_MUL: 1680 case AMDGPU::G_SHL: 1681 case AMDGPU::G_LSHR: 1682 case AMDGPU::G_ASHR: 1683 case AMDGPU::G_UADDO: 1684 case AMDGPU::G_SADDO: 1685 case AMDGPU::G_USUBO: 1686 case AMDGPU::G_SSUBO: 1687 case AMDGPU::G_UADDE: 1688 case AMDGPU::G_SADDE: 1689 case AMDGPU::G_USUBE: 1690 case AMDGPU::G_SSUBE: 1691 case AMDGPU::G_UMULH: 1692 case AMDGPU::G_SMULH: 1693 case AMDGPU::G_SMIN: 1694 case AMDGPU::G_SMAX: 1695 case AMDGPU::G_UMIN: 1696 case AMDGPU::G_UMAX: 1697 if (isSALUMapping(MI)) 1698 return getDefaultMappingSOP(MI); 1699 LLVM_FALLTHROUGH; 1700 1701 case AMDGPU::G_FADD: 1702 case AMDGPU::G_FSUB: 1703 case AMDGPU::G_FPTOSI: 1704 case AMDGPU::G_FPTOUI: 1705 case AMDGPU::G_FMUL: 1706 case AMDGPU::G_FMA: 1707 case AMDGPU::G_FSQRT: 1708 case AMDGPU::G_SITOFP: 1709 case AMDGPU::G_UITOFP: 1710 case AMDGPU::G_FPTRUNC: 1711 case AMDGPU::G_FPEXT: 1712 case AMDGPU::G_FEXP2: 1713 case AMDGPU::G_FLOG2: 1714 case AMDGPU::G_FMINNUM: 1715 case AMDGPU::G_FMAXNUM: 1716 case AMDGPU::G_FMINNUM_IEEE: 1717 case AMDGPU::G_FMAXNUM_IEEE: 1718 case AMDGPU::G_FCANONICALIZE: 1719 case AMDGPU::G_INTRINSIC_TRUNC: 1720 case AMDGPU::G_INTRINSIC_ROUND: 1721 return getDefaultMappingVOP(MI); 1722 case AMDGPU::G_IMPLICIT_DEF: { 1723 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1724 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 1725 break; 1726 } 1727 case AMDGPU::G_FCONSTANT: 1728 case AMDGPU::G_CONSTANT: 1729 case AMDGPU::G_FRAME_INDEX: 1730 case AMDGPU::G_BLOCK_ADDR: { 1731 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1732 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 1733 break; 1734 } 1735 case AMDGPU::G_INSERT: { 1736 unsigned BankID = isSALUMapping(MI) ? 
AMDGPU::SGPRRegBankID : 1737 AMDGPU::VGPRRegBankID; 1738 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 1739 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 1740 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 1741 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 1742 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 1743 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 1744 OpdsMapping[3] = nullptr; 1745 break; 1746 } 1747 case AMDGPU::G_EXTRACT: { 1748 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 1749 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 1750 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 1751 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 1752 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 1753 OpdsMapping[2] = nullptr; 1754 break; 1755 } 1756 case AMDGPU::G_MERGE_VALUES: 1757 case AMDGPU::G_BUILD_VECTOR: 1758 case AMDGPU::G_CONCAT_VECTORS: { 1759 unsigned Bank = isSALUMapping(MI) ? 1760 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 1761 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1762 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 1763 1764 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 1765 // Op1 and Dst should use the same register bank. 
1766 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 1767 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 1768 break; 1769 } 1770 case AMDGPU::G_BITCAST: 1771 case AMDGPU::G_INTTOPTR: 1772 case AMDGPU::G_PTRTOINT: 1773 case AMDGPU::G_CTLZ: 1774 case AMDGPU::G_CTLZ_ZERO_UNDEF: 1775 case AMDGPU::G_CTTZ: 1776 case AMDGPU::G_CTTZ_ZERO_UNDEF: 1777 case AMDGPU::G_CTPOP: 1778 case AMDGPU::G_BSWAP: 1779 case AMDGPU::G_FABS: 1780 case AMDGPU::G_FNEG: { 1781 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1782 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 1783 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 1784 break; 1785 } 1786 case AMDGPU::G_TRUNC: { 1787 Register Dst = MI.getOperand(0).getReg(); 1788 Register Src = MI.getOperand(1).getReg(); 1789 unsigned Bank = getRegBankID(Src, MRI, *TRI); 1790 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 1791 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 1792 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 1793 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 1794 break; 1795 } 1796 case AMDGPU::G_ZEXT: 1797 case AMDGPU::G_SEXT: 1798 case AMDGPU::G_ANYEXT: { 1799 Register Dst = MI.getOperand(0).getReg(); 1800 Register Src = MI.getOperand(1).getReg(); 1801 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 1802 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 1803 1804 unsigned DstBank; 1805 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 1806 assert(SrcBank); 1807 switch (SrcBank->getID()) { 1808 case AMDGPU::SCCRegBankID: 1809 case AMDGPU::SGPRRegBankID: 1810 DstBank = AMDGPU::SGPRRegBankID; 1811 break; 1812 default: 1813 DstBank = AMDGPU::VGPRRegBankID; 1814 break; 1815 } 1816 1817 // TODO: Should anyext be split into 32-bit part as well? 
1818 if (MI.getOpcode() == AMDGPU::G_ANYEXT) { 1819 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); 1820 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize); 1821 } else { 1822 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 1823 // 32-bits, and then to 64. 1824 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 1825 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 1826 SrcSize); 1827 } 1828 break; 1829 } 1830 case AMDGPU::G_FCMP: { 1831 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 1832 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 1833 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 1834 OpdsMapping[1] = nullptr; // Predicate Operand. 1835 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); 1836 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 1837 break; 1838 } 1839 case AMDGPU::G_STORE: { 1840 assert(MI.getOperand(0).isReg()); 1841 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1842 // FIXME: We need to specify a different reg bank once scalar stores 1843 // are supported. 1844 const ValueMapping *ValMapping = 1845 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 1846 // FIXME: Depending on the type of store, the pointer could be in 1847 // the SGPR Reg bank. 1848 // FIXME: Pointer size should be based on the address space. 
1849 const ValueMapping *PtrMapping = 1850 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 1851 1852 OpdsMapping[0] = ValMapping; 1853 OpdsMapping[1] = PtrMapping; 1854 break; 1855 } 1856 1857 case AMDGPU::G_ICMP: { 1858 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 1859 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 1860 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 1861 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); 1862 1863 bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID && 1864 Op3Bank == AMDGPU::SGPRRegBankID && 1865 (Size == 32 || (Size == 64 && 1866 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 1867 MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64())); 1868 1869 unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; 1870 1871 OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1); 1872 OpdsMapping[1] = nullptr; // Predicate Operand. 1873 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); 1874 OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size); 1875 break; 1876 } 1877 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 1878 unsigned OutputBankID = isSALUMapping(MI) ? 1879 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 1880 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 1881 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 1882 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 1883 1884 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize); 1885 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize); 1886 1887 // The index can be either if the source vector is VGPR. 1888 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); 1889 break; 1890 } 1891 case AMDGPU::G_INSERT_VECTOR_ELT: { 1892 unsigned OutputBankID = isSALUMapping(MI) ? 
1893 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 1894 1895 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1896 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 1897 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 1898 unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 1899 unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); 1900 1901 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); 1902 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); 1903 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize); 1904 1905 // The index can be either if the source vector is VGPR. 1906 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 1907 break; 1908 } 1909 case AMDGPU::G_UNMERGE_VALUES: { 1910 unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : 1911 AMDGPU::VGPRRegBankID; 1912 1913 // Op1 and Dst should use the same register bank. 1914 // FIXME: Shouldn't this be the default? Why do we need to handle this? 
1915 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 1916 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 1917 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 1918 } 1919 break; 1920 } 1921 case AMDGPU::G_INTRINSIC: { 1922 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { 1923 default: 1924 return getInvalidInstructionMapping(); 1925 case Intrinsic::amdgcn_div_fmas: 1926 case Intrinsic::amdgcn_trig_preop: 1927 case Intrinsic::amdgcn_sin: 1928 case Intrinsic::amdgcn_cos: 1929 case Intrinsic::amdgcn_log_clamp: 1930 case Intrinsic::amdgcn_rcp: 1931 case Intrinsic::amdgcn_rcp_legacy: 1932 case Intrinsic::amdgcn_rsq: 1933 case Intrinsic::amdgcn_rsq_legacy: 1934 case Intrinsic::amdgcn_rsq_clamp: 1935 case Intrinsic::amdgcn_ldexp: 1936 case Intrinsic::amdgcn_frexp_mant: 1937 case Intrinsic::amdgcn_frexp_exp: 1938 case Intrinsic::amdgcn_fract: 1939 case Intrinsic::amdgcn_cvt_pkrtz: 1940 case Intrinsic::amdgcn_cvt_pknorm_i16: 1941 case Intrinsic::amdgcn_cvt_pknorm_u16: 1942 case Intrinsic::amdgcn_cvt_pk_i16: 1943 case Intrinsic::amdgcn_cvt_pk_u16: 1944 case Intrinsic::amdgcn_fmed3: 1945 case Intrinsic::amdgcn_cubeid: 1946 case Intrinsic::amdgcn_cubema: 1947 case Intrinsic::amdgcn_cubesc: 1948 case Intrinsic::amdgcn_cubetc: 1949 case Intrinsic::amdgcn_sffbh: 1950 case Intrinsic::amdgcn_fmad_ftz: 1951 case Intrinsic::amdgcn_mbcnt_lo: 1952 case Intrinsic::amdgcn_mbcnt_hi: 1953 case Intrinsic::amdgcn_ubfe: 1954 case Intrinsic::amdgcn_sbfe: 1955 case Intrinsic::amdgcn_lerp: 1956 case Intrinsic::amdgcn_sad_u8: 1957 case Intrinsic::amdgcn_msad_u8: 1958 case Intrinsic::amdgcn_sad_hi_u8: 1959 case Intrinsic::amdgcn_sad_u16: 1960 case Intrinsic::amdgcn_qsad_pk_u16_u8: 1961 case Intrinsic::amdgcn_mqsad_pk_u16_u8: 1962 case Intrinsic::amdgcn_mqsad_u32_u8: 1963 case Intrinsic::amdgcn_cvt_pk_u8_f32: 1964 case Intrinsic::amdgcn_alignbit: 1965 case Intrinsic::amdgcn_alignbyte: 1966 case Intrinsic::amdgcn_fdot2: 1967 case 
Intrinsic::amdgcn_sdot2: 1968 case Intrinsic::amdgcn_udot2: 1969 case Intrinsic::amdgcn_sdot4: 1970 case Intrinsic::amdgcn_udot4: 1971 case Intrinsic::amdgcn_sdot8: 1972 case Intrinsic::amdgcn_udot8: 1973 case Intrinsic::amdgcn_wwm: 1974 case Intrinsic::amdgcn_wqm: 1975 return getDefaultMappingVOP(MI); 1976 case Intrinsic::amdgcn_ds_permute: 1977 case Intrinsic::amdgcn_ds_bpermute: 1978 case Intrinsic::amdgcn_update_dpp: 1979 return getDefaultMappingAllVGPR(MI); 1980 case Intrinsic::amdgcn_kernarg_segment_ptr: 1981 case Intrinsic::amdgcn_s_getpc: 1982 case Intrinsic::amdgcn_groupstaticsize: { 1983 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1984 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 1985 break; 1986 } 1987 case Intrinsic::amdgcn_wqm_vote: { 1988 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1989 OpdsMapping[0] = OpdsMapping[2] 1990 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 1991 break; 1992 } 1993 case Intrinsic::amdgcn_s_buffer_load: { 1994 // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS 1995 Register RSrc = MI.getOperand(2).getReg(); // SGPR 1996 Register Offset = MI.getOperand(3).getReg(); // SGPR/imm 1997 1998 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 1999 unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); 2000 unsigned Size3 = MRI.getType(Offset).getSizeInBits(); 2001 2002 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); 2003 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI); 2004 2005 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0); 2006 OpdsMapping[1] = nullptr; // intrinsic id 2007 2008 // Lie and claim everything is legal, even though some need to be 2009 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 
2010 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc 2011 OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3); 2012 OpdsMapping[4] = nullptr; 2013 break; 2014 } 2015 case Intrinsic::amdgcn_div_scale: { 2016 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2017 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 2018 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 2019 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 2020 2021 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 2022 OpdsMapping[3] = AMDGPU::getValueMapping( 2023 getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize); 2024 OpdsMapping[4] = AMDGPU::getValueMapping( 2025 getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize); 2026 2027 break; 2028 } 2029 case Intrinsic::amdgcn_class: { 2030 Register Src0Reg = MI.getOperand(2).getReg(); 2031 Register Src1Reg = MI.getOperand(3).getReg(); 2032 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 2033 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 2034 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2035 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 2036 OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI), 2037 Src0Size); 2038 OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI), 2039 Src1Size); 2040 break; 2041 } 2042 case Intrinsic::amdgcn_icmp: 2043 case Intrinsic::amdgcn_fcmp: { 2044 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2045 // This is not VCCRegBank because this is not used in boolean contexts. 
2046 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 2047 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2048 unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 2049 unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); 2050 OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize); 2051 OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize); 2052 break; 2053 } 2054 case Intrinsic::amdgcn_readlane: { 2055 // This must be an SGPR, but accept a VGPR. 2056 Register IdxReg = MI.getOperand(3).getReg(); 2057 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 2058 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 2059 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 2060 LLVM_FALLTHROUGH; 2061 } 2062 case Intrinsic::amdgcn_readfirstlane: { 2063 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2064 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2065 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 2066 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 2067 break; 2068 } 2069 case Intrinsic::amdgcn_writelane: { 2070 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2071 Register SrcReg = MI.getOperand(2).getReg(); 2072 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 2073 unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 2074 Register IdxReg = MI.getOperand(3).getReg(); 2075 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 2076 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 2077 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 2078 2079 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 2080 // to legalize. 
2081 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 2082 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 2083 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 2084 break; 2085 } 2086 case Intrinsic::amdgcn_if_break: { 2087 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 2088 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2089 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 2090 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2091 break; 2092 } 2093 } 2094 break; 2095 } 2096 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 2097 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { 2098 default: 2099 return getInvalidInstructionMapping(); 2100 case Intrinsic::amdgcn_s_getreg: 2101 case Intrinsic::amdgcn_s_memtime: 2102 case Intrinsic::amdgcn_s_memrealtime: 2103 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { 2104 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2105 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2106 break; 2107 } 2108 case Intrinsic::amdgcn_ds_append: 2109 case Intrinsic::amdgcn_ds_consume: 2110 case Intrinsic::amdgcn_ds_fadd: 2111 case Intrinsic::amdgcn_ds_fmin: 2112 case Intrinsic::amdgcn_ds_fmax: 2113 case Intrinsic::amdgcn_atomic_inc: 2114 case Intrinsic::amdgcn_atomic_dec: 2115 return getDefaultMappingAllVGPR(MI); 2116 case Intrinsic::amdgcn_ds_ordered_add: 2117 case Intrinsic::amdgcn_ds_ordered_swap: { 2118 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2119 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 2120 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 2121 AMDGPU::SGPRRegBankID); 2122 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 2123 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2124 break; 2125 } 2126 case Intrinsic::amdgcn_exp_compr: 2127 
OpdsMapping[0] = nullptr; // IntrinsicID 2128 // FIXME: These are immediate values which can't be read from registers. 2129 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2130 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2131 // FIXME: Could we support packed types here? 2132 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2133 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2134 // FIXME: These are immediate values which can't be read from registers. 2135 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2136 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2137 break; 2138 case Intrinsic::amdgcn_exp: 2139 OpdsMapping[0] = nullptr; // IntrinsicID 2140 // FIXME: These are immediate values which can't be read from registers. 2141 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2142 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2143 // FIXME: Could we support packed types here? 2144 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2145 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2146 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2147 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2148 // FIXME: These are immediate values which can't be read from registers. 
2149 OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2150 OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2151 break; 2152 case Intrinsic::amdgcn_buffer_load: { 2153 Register RSrc = MI.getOperand(2).getReg(); // SGPR 2154 Register VIndex = MI.getOperand(3).getReg(); // VGPR 2155 Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm 2156 2157 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2158 unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); 2159 unsigned Size3 = MRI.getType(VIndex).getSizeInBits(); 2160 unsigned Size4 = MRI.getType(Offset).getSizeInBits(); 2161 2162 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); 2163 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI); 2164 2165 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); 2166 OpdsMapping[1] = nullptr; // intrinsic id 2167 2168 // Lie and claim everything is legal, even though some need to be 2169 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 2170 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc 2171 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3); 2172 OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4); 2173 OpdsMapping[5] = nullptr; 2174 OpdsMapping[6] = nullptr; 2175 break; 2176 } 2177 case Intrinsic::amdgcn_s_sendmsg: 2178 case Intrinsic::amdgcn_s_sendmsghalt: { 2179 // This must be an SGPR, but accept a VGPR. 
2180 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 2181 AMDGPU::SGPRRegBankID); 2182 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 2183 break; 2184 } 2185 case Intrinsic::amdgcn_end_cf: { 2186 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 2187 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2188 break; 2189 } 2190 } 2191 break; 2192 } 2193 case AMDGPU::G_SELECT: { 2194 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2195 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 2196 AMDGPU::SGPRRegBankID); 2197 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI, 2198 AMDGPU::SGPRRegBankID); 2199 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && 2200 Op3Bank == AMDGPU::SGPRRegBankID; 2201 2202 unsigned CondBankDefault = SGPRSrcs ? 2203 AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; 2204 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 2205 CondBankDefault); 2206 if (CondBank == AMDGPU::SGPRRegBankID) 2207 CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; 2208 else if (CondBank == AMDGPU::VGPRRegBankID) 2209 CondBank = AMDGPU::VCCRegBankID; 2210 2211 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ? 
2212 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 2213 2214 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID); 2215 2216 if (Size == 64) { 2217 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 2218 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 2219 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 2220 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 2221 } else { 2222 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); 2223 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 2224 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); 2225 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); 2226 } 2227 2228 break; 2229 } 2230 2231 case AMDGPU::G_LOAD: 2232 return getInstrMappingForLoad(MI); 2233 2234 case AMDGPU::G_ATOMICRMW_XCHG: 2235 case AMDGPU::G_ATOMICRMW_ADD: 2236 case AMDGPU::G_ATOMICRMW_SUB: 2237 case AMDGPU::G_ATOMICRMW_AND: 2238 case AMDGPU::G_ATOMICRMW_OR: 2239 case AMDGPU::G_ATOMICRMW_XOR: 2240 case AMDGPU::G_ATOMICRMW_MAX: 2241 case AMDGPU::G_ATOMICRMW_MIN: 2242 case AMDGPU::G_ATOMICRMW_UMAX: 2243 case AMDGPU::G_ATOMICRMW_UMIN: 2244 case AMDGPU::G_ATOMICRMW_FADD: 2245 case AMDGPU::G_ATOMIC_CMPXCHG: { 2246 return getDefaultMappingAllVGPR(MI); 2247 } 2248 case AMDGPU::G_BRCOND: { 2249 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI, 2250 AMDGPU::SGPRRegBankID); 2251 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 2252 if (Bank != AMDGPU::SCCRegBankID) 2253 Bank = AMDGPU::VCCRegBankID; 2254 2255 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); 2256 break; 2257 } 2258 } 2259 2260 return getInstructionMapping(/*ID*/1, /*Cost*/1, 2261 getOperandsMapping(OpdsMapping), 2262 MI.getNumOperands()); 2263 } 2264 2265