//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
                                        CodeGenCoverage &CoverageInfo) {
  MRI = &MF.getRegInfo();
  InstructionSelector::setupMF(MF, KB, CoverageInfo);
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  if (Register::isPhysicalRegister(Reg))
    return Reg == TRI.getVCC();

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

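// Rewrite a copy-like intrinsic (wqm/softwqm/wwm) in place: swap in the target
// pseudo opcode, drop the intrinsic ID operand, add an implicit EXEC use, and
// constrain both operands to a single common register class.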
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (Register::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

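  // Pick the register class for the PHI result: use an explicitly assigned
  // class if there is one, otherwise derive it from the register bank and type.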
  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

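// Select G_AND/G_OR/G_XOR on the scalar unit. VCC-bank booleans are handled as
// full wave-mask operations, so the opcode width depends on both the type size
// and the wavefront size (64-bit on wave64).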
bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
          .add(I.getOperand(1))
          .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .addDef(UnusedCarry, RegState::Dead)
        .add(I.getOperand(1))
        .add(I.getOperand(2))
        .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

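  // 64-bit case: split both operands into 32-bit halves, add the low and high
  // halves with carry (S_ADD_U32/S_ADDC_U32 or V_ADD_CO_U32/V_ADDC_U32), and
  // recombine the result with a REG_SEQUENCE.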
  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
      IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

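  // Scalar path: the carry is modeled through SCC. A carry-in was copied into
  // SCC above, and the carry-out is copied back out of SCC below.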
  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
    .addReg(AMDGPU::SCC);

  if (!MRI->getRegClassOrNull(Dst1Reg))
    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

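    // Constrain each merge source to its bank's register class; the
    // REG_SEQUENCE result itself is constrained to DstRC after the loop.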
    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  const unsigned SrcFlags = getUndefRegState(Src.isUndef());

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, SrcFlags, SubRegs[I]);

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
  MachineInstr &MI) const {
  if (selectImpl(MI, *CoverageInfo))
    return true;

  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != V2S16)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI->getType(Src0) != S32)
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;

      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
        .addImm(Lo16 | (Hi16 << 16));
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.RemoveOperand(2);
    return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
           RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
  }

  Register ShiftSrc0;
  Register ShiftSrc1;
  int64_t ShiftAmt;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  // FIXME: This is an inconvenient way to check a specific value
  bool Shift0 = mi_match(
    Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  bool Shift1 = mi_match(
    Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
    // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
    auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
      .addReg(ShiftSrc0)
      .addImm(16);

    MI.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
  const TargetRegisterClass *Src1RC =
    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
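// V_DIV_SCALE_F32/F64 define both the scaled value and a VCC predicate result.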
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64;
  else
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addUse(Src0)
    .addUse(Denom)
    .addUse(Numer);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::WWM);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
    return selectIntrinsicIcmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
      .add(I.getOperand(2))
      .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
      constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
      RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
                               I.getOperand(0).getReg())
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
                               *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;

  if (Size != STI.getWavefrontSize())
    return false;

  Optional<ValueAndVReg> Arg =
    getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);

  if (Arg.hasValue()) {
    const int64_t Value = Arg.getValue().Value;
    if (Value == 0) {
      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else if (Value == -1) { // all ones
      Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
    } else
      return false;
  } else {
    Register SrcReg = I.getOperand(2).getReg();
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
  }

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto RelocSymbol = cast<GlobalVariable>(
    M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  MachineBasicBlock *BB = I.getParent();
  BuildMI(*BB, &I, I.getDebugLoc(),
          TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
    .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
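  // SI_END_CF consumes the saved exec mask and re-enables the lanes it records.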
  MachineBasicBlock *BB = MI.getParent();
  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
    .add(MI.getOperand(1));

  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
  return true;
}

static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
  switch (MF.getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    return 1;
  case CallingConv::AMDGPU_VS:
    return 2;
  case CallingConv::AMDGPU_GS:
    return 3;
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_ES:
    report_fatal_error("ds_ordered_count unsupported for this calling conv");
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::C:
  case CallingConv::Fast:
  default:
    // Assume other calling conventions are various compute callable functions
    return 0;
  }
}

bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
  MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease)
    report_fatal_error("ds_ordered_count: wave_done requires wave_release");

  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      report_fatal_error(
        "ds_ordered_count: dword count must be between 1 and 4");
    }
  }

  if (IndexOperand)
    report_fatal_error("ds_ordered_count: bad index operand");

  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
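  // The DS_ORDERED_COUNT offset immediate packs the ordered-count index into
  // offset0, and wave_release/wave_done, shader type, instruction kind (and
  // the dword count on GFX10) into offset1, as computed below.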
  unsigned ShaderType = getDSShaderTypeValue(*MF);

  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                     (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !STI.hasGWSSemaReleaseAll())
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  assert(OffsetDef);

  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only sets the low 16-bits, we could leave it as-is and add 1
    // to the immediate offset.
    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset, OffsetDef)
      = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16);

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);
    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .addImm(-1) // $gds
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
    .addImm(Offset)
    .addImm(IsGDS ? -1 : 0)
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x2;

  return TexFailCtrl == 0;
}

static bool parseCachePolicy(uint64_t Value,
                             bool *GLC, bool *SLC, bool *DLC) {
  if (GLC) {
    *GLC = (Value & 0x1) ? 1 : 0;
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = (Value & 0x2) ? 1 : 0;
    Value &= ~(uint64_t)0x2;
  }
  if (DLC) {
    *DLC = (Value & 0x4) ? 1 : 0;
    Value &= ~(uint64_t)0x4;
  }

  return Value == 0;
}

bool AMDGPUInstructionSelector::selectImageIntrinsic(
  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
      AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
      AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;

  const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
                                             MI.getNumExplicitDefs());
  int NumVAddr, NumGradients;
  std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = false;

  // XXX - Can we just get the second to last argument for ctrl?
  unsigned CtrlIdx; // Index of texfailctrl argument
  bool Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = true;
    CtrlIdx = VAddrIdx + NumVAddr + 1;
  } else {
    Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
    CtrlIdx = VAddrIdx + NumVAddr + 3;
  }

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients
  if (IsA16 && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    const int DMaskIdx = 2; // Input/output + intrinsic ID.

    DMask = MI.getOperand(DMaskIdx).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      // One memoperand is mandatory, except for getresinfo.
      // FIXME: Check this in verifier.
      if (!MI.memoperands_empty()) {
        const MachineMemOperand *MMO = *MI.memoperands_begin();

        // Infer d16 from the memory size, as the register type will be
        // mangled by unpacked subtargets, or by TFE.
        IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;

        if (IsD16 && !STI.hasUnpackedD16VMem())
          NumVDataDwords = (DMaskLanes + 1) / 2;
      }
    }
  }

  // Optimize _L to _LZ when _L is zero
  if (LZMappingInfo) {
    // The legalizer replaced the register with an immediate 0 if we need to
    // change the opcode.
    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (MIPMappingInfo) {
    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
    }
  }

  // Set G16 opcode
  if (IsG16 && !IsA16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16;  // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  bool GLC = false;
  bool SLC = false;
  bool DLC = false;
  if (BaseOpcode->Atomic) {
    GLC = true; // TODO no-return optimization
    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return false;
  } else {
    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return false;
  }

  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (int I = 0; I < NumVAddr; ++I) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register.
  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  if (IsTexFail)
    ++NumVDataDwords;

  int Opcode = -1;
  if (IsGFX10) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;

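      // The X2 atomic defines a double-width temporary; the value the
      // intrinsic returns is its low half, so copy that out below.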
      MIB.addDef(TmpReg);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
        .addReg(TmpReg, RegState::Kill, SubReg);

    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  for (int i = 0; i != NumVAddrRegs; ++i) {
    MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler

  MIB.addImm(DMask); // dmask

  if (IsGFX10)
    MIB.addImm(DimInfo->Encoding);
  MIB.addImm(Unorm);
  if (IsGFX10)
    MIB.addImm(DLC);

  MIB.addImm(GLC);
  MIB.addImm(SLC);
  MIB.addImm(IsA16 &&  // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10)
    MIB.addImm(IsA16 ? -1 : 0);

  MIB.addImm(TFE); // tfe
  MIB.addImm(LWE); // lwe
  if (!IsGFX10)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
  MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  default: {
    return selectImpl(I, *CoverageInfo);
  }
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it. So we need to manually set the register class here.
1646 if (!MRI->getRegClassOrNull(CCReg)) 1647 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1648 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1649 .add(I.getOperand(2)) 1650 .add(I.getOperand(3)); 1651 1652 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1653 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1654 I.eraseFromParent(); 1655 return Ret; 1656 } 1657 1658 // Wide VGPR select should have been split in RegBankSelect. 1659 if (Size > 32) 1660 return false; 1661 1662 MachineInstr *Select = 1663 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1664 .addImm(0) 1665 .add(I.getOperand(3)) 1666 .addImm(0) 1667 .add(I.getOperand(2)) 1668 .add(I.getOperand(1)); 1669 1670 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1671 I.eraseFromParent(); 1672 return Ret; 1673 } 1674 1675 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1676 initM0(I); 1677 return selectImpl(I, *CoverageInfo); 1678 } 1679 1680 static int sizeToSubRegIndex(unsigned Size) { 1681 switch (Size) { 1682 case 32: 1683 return AMDGPU::sub0; 1684 case 64: 1685 return AMDGPU::sub0_sub1; 1686 case 96: 1687 return AMDGPU::sub0_sub1_sub2; 1688 case 128: 1689 return AMDGPU::sub0_sub1_sub2_sub3; 1690 case 256: 1691 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1692 default: 1693 if (Size < 32) 1694 return AMDGPU::sub0; 1695 if (Size > 256) 1696 return -1; 1697 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1698 } 1699 } 1700 1701 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1702 Register DstReg = I.getOperand(0).getReg(); 1703 Register SrcReg = I.getOperand(1).getReg(); 1704 const LLT DstTy = MRI->getType(DstReg); 1705 const LLT SrcTy = MRI->getType(SrcReg); 1706 const LLT S1 = LLT::scalar(1); 1707 1708 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1709 const RegisterBank *DstRB; 1710 if (DstTy == S1) { 1711 // This is a special case. We don't treat s1 for legalization artifacts as 1712 // vcc booleans. 1713 DstRB = SrcRB; 1714 } else { 1715 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1716 if (SrcRB != DstRB) 1717 return false; 1718 } 1719 1720 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 1721 1722 unsigned DstSize = DstTy.getSizeInBits(); 1723 unsigned SrcSize = SrcTy.getSizeInBits(); 1724 1725 const TargetRegisterClass *SrcRC 1726 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1727 const TargetRegisterClass *DstRC 1728 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1729 if (!SrcRC || !DstRC) 1730 return false; 1731 1732 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1733 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1734 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1735 return false; 1736 } 1737 1738 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { 1739 MachineBasicBlock *MBB = I.getParent(); 1740 const DebugLoc &DL = I.getDebugLoc(); 1741 1742 Register LoReg = MRI->createVirtualRegister(DstRC); 1743 Register HiReg = MRI->createVirtualRegister(DstRC); 1744 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 1745 .addReg(SrcReg, 0, AMDGPU::sub0); 1746 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 1747 .addReg(SrcReg, 0, AMDGPU::sub1); 1748 1749 if (IsVALU && STI.hasSDWA()) { 1750 // Write the low 16-bits of the high element into the high 16-bits of the 1751 // low element. 
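  // V_MOV_B32_sdwa with dst_sel:WORD_1 and dst_unused:UNUSED_PRESERVE writes
  // only the high 16 bits of DstReg and preserves the rest, so together with
  // the tied implicit use of LoReg this packs both halves in one instruction.
  // Conceptually it computes Dst = (Lo & 0xffff) | (Hi << 16), the same thing
  // the shift/and/or fallback below does when SDWA is not available.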
1752 MachineInstr *MovSDWA = 1753 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1754 .addImm(0) // $src0_modifiers 1755 .addReg(HiReg) // $src0 1756 .addImm(0) // $clamp 1757 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1758 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1759 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1760 .addReg(LoReg, RegState::Implicit); 1761 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1762 } else { 1763 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1764 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1765 Register ImmReg = MRI->createVirtualRegister(DstRC); 1766 if (IsVALU) { 1767 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1768 .addImm(16) 1769 .addReg(HiReg); 1770 } else { 1771 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1772 .addReg(HiReg) 1773 .addImm(16); 1774 } 1775 1776 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1777 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1778 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1779 1780 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1781 .addImm(0xffff); 1782 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1783 .addReg(LoReg) 1784 .addReg(ImmReg); 1785 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1786 .addReg(TmpReg0) 1787 .addReg(TmpReg1); 1788 } 1789 1790 I.eraseFromParent(); 1791 return true; 1792 } 1793 1794 if (!DstTy.isScalar()) 1795 return false; 1796 1797 if (SrcSize > 32) { 1798 int SubRegIdx = sizeToSubRegIndex(DstSize); 1799 if (SubRegIdx == -1) 1800 return false; 1801 1802 // Deal with weird cases where the class only partially supports the subreg 1803 // index. 1804 const TargetRegisterClass *SrcWithSubRC 1805 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1806 if (!SrcWithSubRC) 1807 return false; 1808 1809 if (SrcWithSubRC != SrcRC) { 1810 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1811 return false; 1812 } 1813 1814 I.getOperand(1).setSubReg(SubRegIdx); 1815 } 1816 1817 I.setDesc(TII.get(TargetOpcode::COPY)); 1818 return true; 1819 } 1820 1821 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1822 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1823 Mask = maskTrailingOnes<unsigned>(Size); 1824 int SignedMask = static_cast<int>(Mask); 1825 return SignedMask >= -16 && SignedMask <= 64; 1826 } 1827 1828 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1829 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1830 Register Reg, const MachineRegisterInfo &MRI, 1831 const TargetRegisterInfo &TRI) const { 1832 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1833 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1834 return RB; 1835 1836 // Ignore the type, since we don't use vcc in artifacts. 
1837 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1838 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1839 return nullptr; 1840 } 1841 1842 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1843 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1844 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1845 const DebugLoc &DL = I.getDebugLoc(); 1846 MachineBasicBlock &MBB = *I.getParent(); 1847 const Register DstReg = I.getOperand(0).getReg(); 1848 const Register SrcReg = I.getOperand(1).getReg(); 1849 1850 const LLT DstTy = MRI->getType(DstReg); 1851 const LLT SrcTy = MRI->getType(SrcReg); 1852 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1853 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1854 const unsigned DstSize = DstTy.getSizeInBits(); 1855 if (!DstTy.isScalar()) 1856 return false; 1857 1858 // Artifact casts should never use vcc. 1859 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1860 1861 // FIXME: This should probably be illegal and split earlier. 1862 if (I.getOpcode() == AMDGPU::G_ANYEXT) { 1863 if (DstSize <= 32) 1864 return selectCOPY(I); 1865 1866 const TargetRegisterClass *SrcRC = 1867 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI); 1868 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 1869 const TargetRegisterClass *DstRC = 1870 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 1871 1872 Register UndefReg = MRI->createVirtualRegister(SrcRC); 1873 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1874 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1875 .addReg(SrcReg) 1876 .addImm(AMDGPU::sub0) 1877 .addReg(UndefReg) 1878 .addImm(AMDGPU::sub1); 1879 I.eraseFromParent(); 1880 1881 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) && 1882 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI); 1883 } 1884 1885 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1886 // 64-bit should have been split up in RegBankSelect 1887 1888 // Try to use an and with a mask if it will save code size. 1889 unsigned Mask; 1890 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1891 MachineInstr *ExtI = 1892 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1893 .addImm(Mask) 1894 .addReg(SrcReg); 1895 I.eraseFromParent(); 1896 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1897 } 1898 1899 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1900 MachineInstr *ExtI = 1901 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1902 .addReg(SrcReg) 1903 .addImm(0) // Offset 1904 .addImm(SrcSize); // Width 1905 I.eraseFromParent(); 1906 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1907 } 1908 1909 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1910 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 1911 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 1912 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 1913 return false; 1914 1915 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1916 const unsigned SextOpc = SrcSize == 8 ? 1917 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1918 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1919 .addReg(SrcReg); 1920 I.eraseFromParent(); 1921 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1922 } 1923 1924 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1925 const unsigned BFE32 = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1926 1927 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1928 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 1929 // We need a 64-bit register source, but the high bits don't matter. 1930 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1931 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1932 unsigned SubReg = InReg ? AMDGPU::sub0 : 0; 1933 1934 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1935 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1936 .addReg(SrcReg, 0, SubReg) 1937 .addImm(AMDGPU::sub0) 1938 .addReg(UndefReg) 1939 .addImm(AMDGPU::sub1); 1940 1941 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1942 .addReg(ExtReg) 1943 .addImm(SrcSize << 16); 1944 1945 I.eraseFromParent(); 1946 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1947 } 1948 1949 unsigned Mask; 1950 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1951 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1952 .addReg(SrcReg) 1953 .addImm(Mask); 1954 } else { 1955 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1956 .addReg(SrcReg) 1957 .addImm(SrcSize << 16); 1958 } 1959 1960 I.eraseFromParent(); 1961 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1962 } 1963 1964 return false; 1965 } 1966 1967 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1968 MachineBasicBlock *BB = I.getParent(); 1969 MachineOperand &ImmOp = I.getOperand(1); 1970 1971 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1972 if (ImmOp.isFPImm()) { 1973 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1974 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1975 } else if (ImmOp.isCImm()) { 1976 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); 1977 } 1978 1979 Register DstReg = I.getOperand(0).getReg(); 1980 unsigned Size; 1981 bool IsSgpr; 1982 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1983 if (RB) { 1984 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1985 Size = MRI->getType(DstReg).getSizeInBits(); 1986 } else { 1987 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1988 IsSgpr = TRI.isSGPRClass(RC); 1989 Size = TRI.getRegSizeInBits(*RC); 1990 } 1991 1992 if (Size != 32 && Size != 64) 1993 return false; 1994 1995 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1996 if (Size == 32) { 1997 I.setDesc(TII.get(Opcode)); 1998 I.addImplicitDefUseOperands(*MF); 1999 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2000 } 2001 2002 const DebugLoc &DL = I.getDebugLoc(); 2003 2004 APInt Imm(Size, I.getOperand(1).getImm()); 2005 2006 MachineInstr *ResInst; 2007 if (IsSgpr && TII.isInlineConstant(Imm)) { 2008 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 2009 .addImm(I.getOperand(1).getImm()); 2010 } else { 2011 const TargetRegisterClass *RC = IsSgpr ? 
2012 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 2013 Register LoReg = MRI->createVirtualRegister(RC); 2014 Register HiReg = MRI->createVirtualRegister(RC); 2015 2016 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 2017 .addImm(Imm.trunc(32).getZExtValue()); 2018 2019 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 2020 .addImm(Imm.ashr(32).getZExtValue()); 2021 2022 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2023 .addReg(LoReg) 2024 .addImm(AMDGPU::sub0) 2025 .addReg(HiReg) 2026 .addImm(AMDGPU::sub1); 2027 } 2028 2029 // We can't call constrainSelectedInstRegOperands here, because it doesn't 2030 // work for target independent opcodes 2031 I.eraseFromParent(); 2032 const TargetRegisterClass *DstRC = 2033 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 2034 if (!DstRC) 2035 return true; 2036 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 2037 } 2038 2039 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2040 // Only manually handle the f64 SGPR case. 2041 // 2042 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2043 // the bit ops theoretically have a second result due to the implicit def of 2044 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2045 // that is easy by disabling the check. The result works, but uses a 2046 // nonsensical sreg32orlds_and_sreg_1 regclass. 2047 // 2048 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2049 // the variadic REG_SEQUENCE operands. 2050 2051 Register Dst = MI.getOperand(0).getReg(); 2052 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2053 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2054 MRI->getType(Dst) != LLT::scalar(64)) 2055 return false; 2056 2057 Register Src = MI.getOperand(1).getReg(); 2058 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 2059 if (Fabs) 2060 Src = Fabs->getOperand(1).getReg(); 2061 2062 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2063 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2064 return false; 2065 2066 MachineBasicBlock *BB = MI.getParent(); 2067 const DebugLoc &DL = MI.getDebugLoc(); 2068 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2069 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2070 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2071 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2072 2073 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2074 .addReg(Src, 0, AMDGPU::sub0); 2075 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2076 .addReg(Src, 0, AMDGPU::sub1); 2077 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2078 .addImm(0x80000000); 2079 2080 // Set or toggle sign bit. 2081 unsigned Opc = Fabs ? 
AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32; 2082 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg) 2083 .addReg(HiReg) 2084 .addReg(ConstReg); 2085 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2086 .addReg(LoReg) 2087 .addImm(AMDGPU::sub0) 2088 .addReg(OpReg) 2089 .addImm(AMDGPU::sub1); 2090 MI.eraseFromParent(); 2091 return true; 2092 } 2093 2094 // FIXME: This is a workaround for the same tablegen problems as G_FNEG 2095 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { 2096 Register Dst = MI.getOperand(0).getReg(); 2097 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2098 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2099 MRI->getType(Dst) != LLT::scalar(64)) 2100 return false; 2101 2102 Register Src = MI.getOperand(1).getReg(); 2103 MachineBasicBlock *BB = MI.getParent(); 2104 const DebugLoc &DL = MI.getDebugLoc(); 2105 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2106 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2107 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2108 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2109 2110 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2111 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2112 return false; 2113 2114 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2115 .addReg(Src, 0, AMDGPU::sub0); 2116 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2117 .addReg(Src, 0, AMDGPU::sub1); 2118 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2119 .addImm(0x7fffffff); 2120 2121 // Clear sign bit. 2122 // TODO: Should this used S_BITSET0_*? 2123 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) 2124 .addReg(HiReg) 2125 .addReg(ConstReg); 2126 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2127 .addReg(LoReg) 2128 .addImm(AMDGPU::sub0) 2129 .addReg(OpReg) 2130 .addImm(AMDGPU::sub1); 2131 2132 MI.eraseFromParent(); 2133 return true; 2134 } 2135 2136 static bool isConstant(const MachineInstr &MI) { 2137 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 2138 } 2139 2140 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 2141 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 2142 2143 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); 2144 2145 assert(PtrMI); 2146 2147 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 2148 return; 2149 2150 GEPInfo GEPInfo(*PtrMI); 2151 2152 for (unsigned i = 1; i != 3; ++i) { 2153 const MachineOperand &GEPOp = PtrMI->getOperand(i); 2154 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 2155 assert(OpDef); 2156 if (i == 2 && isConstant(*OpDef)) { 2157 // TODO: Could handle constant base + variable offset, but a combine 2158 // probably should have commuted it. 
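      // Record the constant as this GEP's immediate offset. E.g.
      // (illustrative) for %ptr = G_PTR_ADD %base, 16 the resulting GEPInfo
      // ends up with Imm = 16 and %base in SgprParts or VgprParts depending
      // on %base's register bank.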
2159 assert(GEPInfo.Imm == 0);
2160 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2161 continue;
2162 }
2163 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2164 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2165 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2166 else
2167 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2168 }
2169
2170 AddrInfo.push_back(GEPInfo);
2171 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2172 }
2173
2174 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2175 if (!MI.hasOneMemOperand())
2176 return false;
2177
2178 const MachineMemOperand *MMO = *MI.memoperands_begin();
2179 const Value *Ptr = MMO->getValue();
2180
2181 // UndefValue means this is a load of a kernel input. These are uniform.
2182 // Sometimes LDS instructions have constant pointers.
2183 // If Ptr is null, then that means this mem operand contains a
2184 // PseudoSourceValue like GOT.
2185 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2186 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2187 return true;
2188
2189 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2190 return true;
2191
2192 const Instruction *I = dyn_cast<Instruction>(Ptr);
2193 return I && I->getMetadata("amdgpu.uniform");
2194 }
2195
2196 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2197 for (const GEPInfo &GEPInfo : AddrInfo) {
2198 if (!GEPInfo.VgprParts.empty())
2199 return true;
2200 }
2201 return false;
2202 }
2203
2204 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2205 MachineBasicBlock *BB = I.getParent();
2206
2207 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2208 unsigned AS = PtrTy.getAddressSpace();
2209 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2210 STI.ldsRequiresM0Init()) {
2211 // If DS instructions require M0 initialization, insert it before selecting.
2212 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2213 .addImm(-1);
2214 }
2215 }
2216
2217 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
2218 initM0(I);
2219 return selectImpl(I, *CoverageInfo);
2220 }
2221
2222 // TODO: No rtn optimization.
2223 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2224 MachineInstr &MI) const {
2225 Register PtrReg = MI.getOperand(1).getReg();
2226 const LLT PtrTy = MRI->getType(PtrReg);
2227 if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2228 STI.useFlatForGlobal())
2229 return selectImpl(MI, *CoverageInfo);
2230
2231 Register DstReg = MI.getOperand(0).getReg();
2232 const LLT Ty = MRI->getType(DstReg);
2233 const bool Is64 = Ty.getSizeInBits() == 64;
2234 const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2235 Register TmpReg = MRI->createVirtualRegister(
2236 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2237
2238 const DebugLoc &DL = MI.getDebugLoc();
2239 MachineBasicBlock *BB = MI.getParent();
2240
2241 Register VAddr, RSrcReg, SOffset;
2242 int64_t Offset = 0;
2243
2244 unsigned Opcode;
2245 if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2246 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2247 AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2248 } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2249 RSrcReg, SOffset, Offset)) {
2250 Opcode = Is64 ?
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : 2251 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; 2252 } else 2253 return selectImpl(MI, *CoverageInfo); 2254 2255 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) 2256 .addReg(MI.getOperand(2).getReg()); 2257 2258 if (VAddr) 2259 MIB.addReg(VAddr); 2260 2261 MIB.addReg(RSrcReg); 2262 if (SOffset) 2263 MIB.addReg(SOffset); 2264 else 2265 MIB.addImm(0); 2266 2267 MIB.addImm(Offset); 2268 MIB.addImm(0); // slc 2269 MIB.cloneMemRefs(MI); 2270 2271 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) 2272 .addReg(TmpReg, RegState::Kill, SubReg); 2273 2274 MI.eraseFromParent(); 2275 2276 MRI->setRegClass( 2277 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); 2278 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2279 } 2280 2281 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 2282 MachineBasicBlock *BB = I.getParent(); 2283 MachineOperand &CondOp = I.getOperand(0); 2284 Register CondReg = CondOp.getReg(); 2285 const DebugLoc &DL = I.getDebugLoc(); 2286 2287 unsigned BrOpcode; 2288 Register CondPhysReg; 2289 const TargetRegisterClass *ConstrainRC; 2290 2291 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2292 // whether the branch is uniform when selecting the instruction. In 2293 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 2294 // RegBankSelect knows what it's doing if the branch condition is scc, even 2295 // though it currently does not. 2296 if (!isVCC(CondReg, *MRI)) { 2297 if (MRI->getType(CondReg) != LLT::scalar(32)) 2298 return false; 2299 2300 CondPhysReg = AMDGPU::SCC; 2301 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 2302 ConstrainRC = &AMDGPU::SReg_32RegClass; 2303 } else { 2304 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 2305 // We sort of know that a VCC producer based on the register bank, that ands 2306 // inactive lanes with 0. What if there was a logical operation with vcc 2307 // producers in different blocks/with different exec masks? 2308 // FIXME: Should scc->vcc copies and with exec? 2309 CondPhysReg = TRI.getVCC(); 2310 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 2311 ConstrainRC = TRI.getBoolRC(); 2312 } 2313 2314 if (!MRI->getRegClassOrNull(CondReg)) 2315 MRI->setRegClass(CondReg, ConstrainRC); 2316 2317 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 2318 .addReg(CondReg); 2319 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 2320 .addMBB(I.getOperand(1).getMBB()); 2321 2322 I.eraseFromParent(); 2323 return true; 2324 } 2325 2326 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE( 2327 MachineInstr &I) const { 2328 Register DstReg = I.getOperand(0).getReg(); 2329 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2330 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2331 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 2332 if (IsVGPR) 2333 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 2334 2335 return RBI.constrainGenericRegister( 2336 DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 2337 } 2338 2339 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 2340 Register DstReg = I.getOperand(0).getReg(); 2341 Register SrcReg = I.getOperand(1).getReg(); 2342 Register MaskReg = I.getOperand(2).getReg(); 2343 LLT Ty = MRI->getType(DstReg); 2344 LLT MaskTy = MRI->getType(MaskReg); 2345 2346 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2347 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2348 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 2349 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2350 if (DstRB != SrcRB) // Should only happen for hand written MIR. 2351 return false; 2352 2353 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2354 const TargetRegisterClass &RegRC 2355 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2356 2357 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 2358 *MRI); 2359 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 2360 *MRI); 2361 const TargetRegisterClass *MaskRC = 2362 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); 2363 2364 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2365 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2366 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 2367 return false; 2368 2369 MachineBasicBlock *BB = I.getParent(); 2370 const DebugLoc &DL = I.getDebugLoc(); 2371 if (Ty.getSizeInBits() == 32) { 2372 assert(MaskTy.getSizeInBits() == 32 && 2373 "ptrmask should have been narrowed during legalize"); 2374 2375 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 2376 .addReg(SrcReg) 2377 .addReg(MaskReg); 2378 I.eraseFromParent(); 2379 return true; 2380 } 2381 2382 Register HiReg = MRI->createVirtualRegister(&RegRC); 2383 Register LoReg = MRI->createVirtualRegister(&RegRC); 2384 2385 // Extract the subregisters from the source pointer. 2386 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 2387 .addReg(SrcReg, 0, AMDGPU::sub0); 2388 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 2389 .addReg(SrcReg, 0, AMDGPU::sub1); 2390 2391 Register MaskedLo, MaskedHi; 2392 2393 // Try to avoid emitting a bit operation when we only need to touch half of 2394 // the 64-bit pointer. 2395 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); 2396 2397 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 2398 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 2399 if ((MaskOnes & MaskLo32) == MaskLo32) { 2400 // If all the bits in the low half are 1, we only need a copy for it. 2401 MaskedLo = LoReg; 2402 } else { 2403 // Extract the mask subregister and apply the and. 2404 Register MaskLo = MRI->createVirtualRegister(&RegRC); 2405 MaskedLo = MRI->createVirtualRegister(&RegRC); 2406 2407 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 2408 .addReg(MaskReg, 0, AMDGPU::sub0); 2409 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 2410 .addReg(LoReg) 2411 .addReg(MaskLo); 2412 } 2413 2414 if ((MaskOnes & MaskHi32) == MaskHi32) { 2415 // If all the bits in the high half are 1, we only need a copy for it. 
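    // E.g. (illustrative) aligning a pointer with a mask like
    // 0xfffffffffffffff0: the high 32 bits of the mask are all ones, so HiReg
    // is reused directly and only the low half above needs a real AND.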
2416 MaskedHi = HiReg; 2417 } else { 2418 Register MaskHi = MRI->createVirtualRegister(&RegRC); 2419 MaskedHi = MRI->createVirtualRegister(&RegRC); 2420 2421 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 2422 .addReg(MaskReg, 0, AMDGPU::sub1); 2423 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 2424 .addReg(HiReg) 2425 .addReg(MaskHi); 2426 } 2427 2428 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2429 .addReg(MaskedLo) 2430 .addImm(AMDGPU::sub0) 2431 .addReg(MaskedHi) 2432 .addImm(AMDGPU::sub1); 2433 I.eraseFromParent(); 2434 return true; 2435 } 2436 2437 /// Return the register to use for the index value, and the subregister to use 2438 /// for the indirectly accessed register. 2439 static std::pair<Register, unsigned> 2440 computeIndirectRegIndex(MachineRegisterInfo &MRI, 2441 const SIRegisterInfo &TRI, 2442 const TargetRegisterClass *SuperRC, 2443 Register IdxReg, 2444 unsigned EltSize) { 2445 Register IdxBaseReg; 2446 int Offset; 2447 MachineInstr *Unused; 2448 2449 std::tie(IdxBaseReg, Offset, Unused) 2450 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 2451 if (IdxBaseReg == AMDGPU::NoRegister) { 2452 // This will happen if the index is a known constant. This should ordinarily 2453 // be legalized out, but handle it as a register just in case. 2454 assert(Offset == 0); 2455 IdxBaseReg = IdxReg; 2456 } 2457 2458 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 2459 2460 // Skip out of bounds offsets, or else we would end up using an undefined 2461 // register. 2462 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 2463 return std::make_pair(IdxReg, SubRegs[0]); 2464 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 2465 } 2466 2467 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 2468 MachineInstr &MI) const { 2469 Register DstReg = MI.getOperand(0).getReg(); 2470 Register SrcReg = MI.getOperand(1).getReg(); 2471 Register IdxReg = MI.getOperand(2).getReg(); 2472 2473 LLT DstTy = MRI->getType(DstReg); 2474 LLT SrcTy = MRI->getType(SrcReg); 2475 2476 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2477 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2478 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2479 2480 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2481 // into a waterfall loop. 2482 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2483 return false; 2484 2485 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 2486 *MRI); 2487 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 2488 *MRI); 2489 if (!SrcRC || !DstRC) 2490 return false; 2491 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2492 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2493 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2494 return false; 2495 2496 MachineBasicBlock *BB = MI.getParent(); 2497 const DebugLoc &DL = MI.getDebugLoc(); 2498 const bool Is64 = DstTy.getSizeInBits() == 64; 2499 2500 unsigned SubReg; 2501 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 2502 DstTy.getSizeInBits() / 8); 2503 2504 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 2505 if (DstTy.getSizeInBits() != 32 && !Is64) 2506 return false; 2507 2508 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2509 .addReg(IdxReg); 2510 2511 unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 2512 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 2513 .addReg(SrcReg, 0, SubReg) 2514 .addReg(SrcReg, RegState::Implicit); 2515 MI.eraseFromParent(); 2516 return true; 2517 } 2518 2519 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 2520 return false; 2521 2522 if (!STI.useVGPRIndexMode()) { 2523 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2524 .addReg(IdxReg); 2525 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 2526 .addReg(SrcReg, 0, SubReg) 2527 .addReg(SrcReg, RegState::Implicit); 2528 MI.eraseFromParent(); 2529 return true; 2530 } 2531 2532 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2533 .addReg(IdxReg) 2534 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 2535 BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg) 2536 .addReg(SrcReg, 0, SubReg) 2537 .addReg(SrcReg, RegState::Implicit) 2538 .addReg(AMDGPU::M0, RegState::Implicit); 2539 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2540 2541 MI.eraseFromParent(); 2542 return true; 2543 } 2544 2545 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 2546 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 2547 MachineInstr &MI) const { 2548 Register DstReg = MI.getOperand(0).getReg(); 2549 Register VecReg = MI.getOperand(1).getReg(); 2550 Register ValReg = MI.getOperand(2).getReg(); 2551 Register IdxReg = MI.getOperand(3).getReg(); 2552 2553 LLT VecTy = MRI->getType(DstReg); 2554 LLT ValTy = MRI->getType(ValReg); 2555 unsigned VecSize = VecTy.getSizeInBits(); 2556 unsigned ValSize = ValTy.getSizeInBits(); 2557 2558 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 2559 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 2560 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2561 2562 assert(VecTy.getElementType() == ValTy); 2563 2564 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2565 // into a waterfall loop. 
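  // The indirect-write pseudos take the index through M0 or the VGPR index
  // mode, both of which are scalar state shared by the whole wave, so a
  // divergent index has to be handled by a waterfall loop built in
  // RegBankSelect rather than here.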
2566 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2567 return false; 2568 2569 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 2570 *MRI); 2571 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 2572 *MRI); 2573 2574 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 2575 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 2576 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 2577 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2578 return false; 2579 2580 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 2581 return false; 2582 2583 unsigned SubReg; 2584 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 2585 ValSize / 8); 2586 2587 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 2588 STI.useVGPRIndexMode(); 2589 2590 MachineBasicBlock *BB = MI.getParent(); 2591 const DebugLoc &DL = MI.getDebugLoc(); 2592 2593 if (IndexMode) { 2594 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2595 .addReg(IdxReg) 2596 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 2597 } else { 2598 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2599 .addReg(IdxReg); 2600 } 2601 2602 const MCInstrDesc &RegWriteOp 2603 = TII.getIndirectRegWritePseudo(VecSize, ValSize, 2604 VecRB->getID() == AMDGPU::SGPRRegBankID); 2605 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 2606 .addReg(VecReg) 2607 .addReg(ValReg) 2608 .addImm(SubReg); 2609 2610 if (IndexMode) 2611 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2612 2613 MI.eraseFromParent(); 2614 return true; 2615 } 2616 2617 static bool isZeroOrUndef(int X) { 2618 return X == 0 || X == -1; 2619 } 2620 2621 static bool isOneOrUndef(int X) { 2622 return X == 1 || X == -1; 2623 } 2624 2625 static bool isZeroOrOneOrUndef(int X) { 2626 return X == 0 || X == 1 || X == -1; 2627 } 2628 2629 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single 2630 // 32-bit register. 2631 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, 2632 ArrayRef<int> Mask) { 2633 NewMask[0] = Mask[0]; 2634 NewMask[1] = Mask[1]; 2635 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) 2636 return Src0; 2637 2638 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); 2639 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); 2640 2641 // Shift the mask inputs to be 0/1; 2642 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; 2643 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; 2644 return Src1; 2645 } 2646 2647 // This is only legal with VOP3P instructions as an aid to op_sel matching. 
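// For the v2s16 case each mask element picks a 16-bit half of the
// concatenated <src0, src1> pair: 0/1 select from src0, 2/3 from src1, and -1
// is undef. A legal VOP3P mask only ever reads one of the sources, e.g.
// (illustrative) mask <1, 0> swaps the two halves of a single register and is
// emitted below as V_ALIGNBIT_B32 or S_LSHR_B32 + S_PACK_LL_B32_B16.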
2648 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( 2649 MachineInstr &MI) const { 2650 Register DstReg = MI.getOperand(0).getReg(); 2651 Register Src0Reg = MI.getOperand(1).getReg(); 2652 Register Src1Reg = MI.getOperand(2).getReg(); 2653 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); 2654 2655 const LLT V2S16 = LLT::vector(2, 16); 2656 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) 2657 return false; 2658 2659 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) 2660 return false; 2661 2662 assert(ShufMask.size() == 2); 2663 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); 2664 2665 MachineBasicBlock *MBB = MI.getParent(); 2666 const DebugLoc &DL = MI.getDebugLoc(); 2667 2668 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2669 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2670 const TargetRegisterClass &RC = IsVALU ? 2671 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2672 2673 // Handle the degenerate case which should have folded out. 2674 if (ShufMask[0] == -1 && ShufMask[1] == -1) { 2675 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); 2676 2677 MI.eraseFromParent(); 2678 return RBI.constrainGenericRegister(DstReg, RC, *MRI); 2679 } 2680 2681 // A legal VOP3P mask only reads one of the sources. 2682 int Mask[2]; 2683 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); 2684 2685 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || 2686 !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) 2687 return false; 2688 2689 // TODO: This also should have been folded out 2690 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { 2691 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) 2692 .addReg(SrcVec); 2693 2694 MI.eraseFromParent(); 2695 return true; 2696 } 2697 2698 if (Mask[0] == 1 && Mask[1] == -1) { 2699 if (IsVALU) { 2700 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 2701 .addImm(16) 2702 .addReg(SrcVec); 2703 } else { 2704 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 2705 .addReg(SrcVec) 2706 .addImm(16); 2707 } 2708 } else if (Mask[0] == -1 && Mask[1] == 0) { 2709 if (IsVALU) { 2710 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) 2711 .addImm(16) 2712 .addReg(SrcVec); 2713 } else { 2714 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) 2715 .addReg(SrcVec) 2716 .addImm(16); 2717 } 2718 } else if (Mask[0] == 0 && Mask[1] == 0) { 2719 if (IsVALU) { 2720 // Write low half of the register into the high half. 2721 MachineInstr *MovSDWA = 2722 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2723 .addImm(0) // $src0_modifiers 2724 .addReg(SrcVec) // $src0 2725 .addImm(0) // $clamp 2726 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2727 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2728 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2729 .addReg(SrcVec, RegState::Implicit); 2730 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2731 } else { 2732 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2733 .addReg(SrcVec) 2734 .addReg(SrcVec); 2735 } 2736 } else if (Mask[0] == 1 && Mask[1] == 1) { 2737 if (IsVALU) { 2738 // Write high half of the register into the low half. 
2739 MachineInstr *MovSDWA = 2740 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2741 .addImm(0) // $src0_modifiers 2742 .addReg(SrcVec) // $src0 2743 .addImm(0) // $clamp 2744 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel 2745 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2746 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel 2747 .addReg(SrcVec, RegState::Implicit); 2748 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2749 } else { 2750 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) 2751 .addReg(SrcVec) 2752 .addReg(SrcVec); 2753 } 2754 } else if (Mask[0] == 1 && Mask[1] == 0) { 2755 if (IsVALU) { 2756 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg) 2757 .addReg(SrcVec) 2758 .addReg(SrcVec) 2759 .addImm(16); 2760 } else { 2761 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2762 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) 2763 .addReg(SrcVec) 2764 .addImm(16); 2765 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2766 .addReg(TmpReg) 2767 .addReg(SrcVec); 2768 } 2769 } else 2770 llvm_unreachable("all shuffle masks should be handled"); 2771 2772 MI.eraseFromParent(); 2773 return true; 2774 } 2775 2776 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 2777 if (I.isPHI()) 2778 return selectPHI(I); 2779 2780 if (!I.isPreISelOpcode()) { 2781 if (I.isCopy()) 2782 return selectCOPY(I); 2783 return true; 2784 } 2785 2786 switch (I.getOpcode()) { 2787 case TargetOpcode::G_AND: 2788 case TargetOpcode::G_OR: 2789 case TargetOpcode::G_XOR: 2790 if (selectImpl(I, *CoverageInfo)) 2791 return true; 2792 return selectG_AND_OR_XOR(I); 2793 case TargetOpcode::G_ADD: 2794 case TargetOpcode::G_SUB: 2795 if (selectImpl(I, *CoverageInfo)) 2796 return true; 2797 return selectG_ADD_SUB(I); 2798 case TargetOpcode::G_UADDO: 2799 case TargetOpcode::G_USUBO: 2800 case TargetOpcode::G_UADDE: 2801 case TargetOpcode::G_USUBE: 2802 return selectG_UADDO_USUBO_UADDE_USUBE(I); 2803 case TargetOpcode::G_INTTOPTR: 2804 case TargetOpcode::G_BITCAST: 2805 case TargetOpcode::G_PTRTOINT: 2806 return selectCOPY(I); 2807 case TargetOpcode::G_CONSTANT: 2808 case TargetOpcode::G_FCONSTANT: 2809 return selectG_CONSTANT(I); 2810 case TargetOpcode::G_FNEG: 2811 if (selectImpl(I, *CoverageInfo)) 2812 return true; 2813 return selectG_FNEG(I); 2814 case TargetOpcode::G_FABS: 2815 if (selectImpl(I, *CoverageInfo)) 2816 return true; 2817 return selectG_FABS(I); 2818 case TargetOpcode::G_EXTRACT: 2819 return selectG_EXTRACT(I); 2820 case TargetOpcode::G_MERGE_VALUES: 2821 case TargetOpcode::G_BUILD_VECTOR: 2822 case TargetOpcode::G_CONCAT_VECTORS: 2823 return selectG_MERGE_VALUES(I); 2824 case TargetOpcode::G_UNMERGE_VALUES: 2825 return selectG_UNMERGE_VALUES(I); 2826 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2827 return selectG_BUILD_VECTOR_TRUNC(I); 2828 case TargetOpcode::G_PTR_ADD: 2829 return selectG_PTR_ADD(I); 2830 case TargetOpcode::G_IMPLICIT_DEF: 2831 return selectG_IMPLICIT_DEF(I); 2832 case TargetOpcode::G_FREEZE: 2833 return selectCOPY(I); 2834 case TargetOpcode::G_INSERT: 2835 return selectG_INSERT(I); 2836 case TargetOpcode::G_INTRINSIC: 2837 return selectG_INTRINSIC(I); 2838 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 2839 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 2840 case TargetOpcode::G_ICMP: 2841 if (selectG_ICMP(I)) 2842 return true; 2843 return selectImpl(I, *CoverageInfo); 2844 case TargetOpcode::G_LOAD: 2845 case TargetOpcode::G_ATOMIC_CMPXCHG: 2846 case TargetOpcode::G_ATOMICRMW_XCHG: 2847 case 
TargetOpcode::G_ATOMICRMW_ADD: 2848 case TargetOpcode::G_ATOMICRMW_SUB: 2849 case TargetOpcode::G_ATOMICRMW_AND: 2850 case TargetOpcode::G_ATOMICRMW_OR: 2851 case TargetOpcode::G_ATOMICRMW_XOR: 2852 case TargetOpcode::G_ATOMICRMW_MIN: 2853 case TargetOpcode::G_ATOMICRMW_MAX: 2854 case TargetOpcode::G_ATOMICRMW_UMIN: 2855 case TargetOpcode::G_ATOMICRMW_UMAX: 2856 case TargetOpcode::G_ATOMICRMW_FADD: 2857 case AMDGPU::G_AMDGPU_ATOMIC_INC: 2858 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 2859 return selectG_LOAD_ATOMICRMW(I); 2860 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 2861 return selectG_AMDGPU_ATOMIC_CMPXCHG(I); 2862 case TargetOpcode::G_SELECT: 2863 return selectG_SELECT(I); 2864 case TargetOpcode::G_STORE: 2865 return selectG_STORE(I); 2866 case TargetOpcode::G_TRUNC: 2867 return selectG_TRUNC(I); 2868 case TargetOpcode::G_SEXT: 2869 case TargetOpcode::G_ZEXT: 2870 case TargetOpcode::G_ANYEXT: 2871 case TargetOpcode::G_SEXT_INREG: 2872 if (selectImpl(I, *CoverageInfo)) 2873 return true; 2874 return selectG_SZA_EXT(I); 2875 case TargetOpcode::G_BRCOND: 2876 return selectG_BRCOND(I); 2877 case TargetOpcode::G_FRAME_INDEX: 2878 case TargetOpcode::G_GLOBAL_VALUE: 2879 return selectG_FRAME_INDEX_GLOBAL_VALUE(I); 2880 case TargetOpcode::G_PTRMASK: 2881 return selectG_PTRMASK(I); 2882 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2883 return selectG_EXTRACT_VECTOR_ELT(I); 2884 case TargetOpcode::G_INSERT_VECTOR_ELT: 2885 return selectG_INSERT_VECTOR_ELT(I); 2886 case TargetOpcode::G_SHUFFLE_VECTOR: 2887 return selectG_SHUFFLE_VECTOR(I); 2888 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 2889 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 2890 const AMDGPU::ImageDimIntrinsicInfo *Intr 2891 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); 2892 assert(Intr && "not an image intrinsic with image pseudo"); 2893 return selectImageIntrinsic(I, Intr); 2894 } 2895 default: 2896 return selectImpl(I, *CoverageInfo); 2897 } 2898 return false; 2899 } 2900 2901 InstructionSelector::ComplexRendererFns 2902 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 2903 return {{ 2904 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2905 }}; 2906 2907 } 2908 2909 std::pair<Register, unsigned> 2910 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const { 2911 Register Src = Root.getReg(); 2912 Register OrigSrc = Src; 2913 unsigned Mods = 0; 2914 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 2915 2916 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 2917 Src = MI->getOperand(1).getReg(); 2918 Mods |= SISrcMods::NEG; 2919 MI = getDefIgnoringCopies(Src, *MRI); 2920 } 2921 2922 if (MI && MI->getOpcode() == AMDGPU::G_FABS) { 2923 Src = MI->getOperand(1).getReg(); 2924 Mods |= SISrcMods::ABS; 2925 } 2926 2927 if (Mods != 0 && 2928 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 2929 MachineInstr *UseMI = Root.getParent(); 2930 2931 // If we looked through copies to find source modifiers on an SGPR operand, 2932 // we now have an SGPR register source. To avoid potentially violating the 2933 // constant bus restriction, we need to insert a copy to a VGPR. 2934 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); 2935 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 2936 TII.get(AMDGPU::COPY), VGPRSrc) 2937 .addReg(Src); 2938 Src = VGPRSrc; 2939 } 2940 2941 return std::make_pair(Src, Mods); 2942 } 2943 2944 /// 2945 /// This will select either an SGPR or VGPR operand and will save us from 2946 /// having to write an extra tablegen pattern. 
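/// The returned ComplexRendererFns are callbacks that the generated matcher
/// invokes in order to append the operands it matched, so a single renderer
/// that just re-adds Root forwards the operand unchanged and leaves register
/// constraining to the usual post-selection constraint step.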
2947 InstructionSelector::ComplexRendererFns 2948 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 2949 return {{ 2950 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2951 }}; 2952 } 2953 2954 InstructionSelector::ComplexRendererFns 2955 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 2956 Register Src; 2957 unsigned Mods; 2958 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2959 2960 return {{ 2961 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2962 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 2963 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2964 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2965 }}; 2966 } 2967 2968 InstructionSelector::ComplexRendererFns 2969 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 2970 return {{ 2971 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2972 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2973 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2974 }}; 2975 } 2976 2977 InstructionSelector::ComplexRendererFns 2978 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 2979 Register Src; 2980 unsigned Mods; 2981 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2982 2983 return {{ 2984 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2985 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2986 }}; 2987 } 2988 2989 InstructionSelector::ComplexRendererFns 2990 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 2991 Register Reg = Root.getReg(); 2992 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 2993 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 2994 Def->getOpcode() == AMDGPU::G_FABS)) 2995 return {}; 2996 return {{ 2997 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2998 }}; 2999 } 3000 3001 std::pair<Register, unsigned> 3002 AMDGPUInstructionSelector::selectVOP3PModsImpl( 3003 Register Src, const MachineRegisterInfo &MRI) const { 3004 unsigned Mods = 0; 3005 MachineInstr *MI = MRI.getVRegDef(Src); 3006 3007 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 3008 // It's possible to see an f32 fneg here, but unlikely. 3009 // TODO: Treat f32 fneg as only high bit. 3010 MRI.getType(Src) == LLT::vector(2, 16)) { 3011 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 3012 Src = MI->getOperand(1).getReg(); 3013 MI = MRI.getVRegDef(Src); 3014 } 3015 3016 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 3017 3018 // Packed instructions do not have abs modifiers. 
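  // OP_SEL_1 (op_sel_hi) is the neutral encoding for a packed source: the
  // high half of the result reads the high half of the operand. Only the
  // NEG/NEG_HI bits are toggled above, and only for a whole-vector G_FNEG.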
3019 Mods |= SISrcMods::OP_SEL_1; 3020 3021 return std::make_pair(Src, Mods); 3022 } 3023 3024 InstructionSelector::ComplexRendererFns 3025 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 3026 MachineRegisterInfo &MRI 3027 = Root.getParent()->getParent()->getParent()->getRegInfo(); 3028 3029 Register Src; 3030 unsigned Mods; 3031 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 3032 3033 return {{ 3034 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3035 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3036 }}; 3037 } 3038 3039 InstructionSelector::ComplexRendererFns 3040 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 3041 Register Src; 3042 unsigned Mods; 3043 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3044 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) 3045 return None; 3046 3047 return {{ 3048 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3049 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3050 }}; 3051 } 3052 3053 InstructionSelector::ComplexRendererFns 3054 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 3055 // FIXME: Handle op_sel 3056 return {{ 3057 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3058 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 3059 }}; 3060 } 3061 3062 InstructionSelector::ComplexRendererFns 3063 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 3064 SmallVector<GEPInfo, 4> AddrInfo; 3065 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3066 3067 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3068 return None; 3069 3070 const GEPInfo &GEPInfo = AddrInfo[0]; 3071 Optional<int64_t> EncodedImm = 3072 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false); 3073 if (!EncodedImm) 3074 return None; 3075 3076 unsigned PtrReg = GEPInfo.SgprParts[0]; 3077 return {{ 3078 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3079 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3080 }}; 3081 } 3082 3083 InstructionSelector::ComplexRendererFns 3084 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 3085 SmallVector<GEPInfo, 4> AddrInfo; 3086 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3087 3088 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3089 return None; 3090 3091 const GEPInfo &GEPInfo = AddrInfo[0]; 3092 Register PtrReg = GEPInfo.SgprParts[0]; 3093 Optional<int64_t> EncodedImm = 3094 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 3095 if (!EncodedImm) 3096 return None; 3097 3098 return {{ 3099 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3100 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3101 }}; 3102 } 3103 3104 InstructionSelector::ComplexRendererFns 3105 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 3106 MachineInstr *MI = Root.getParent(); 3107 MachineBasicBlock *MBB = MI->getParent(); 3108 3109 SmallVector<GEPInfo, 4> AddrInfo; 3110 getAddrModeInfo(*MI, *MRI, AddrInfo); 3111 3112 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, 3113 // then we can select all ptr + 32-bit offsets not just immediate offsets. 3114 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3115 return None; 3116 3117 const GEPInfo &GEPInfo = AddrInfo[0]; 3118 // SGPR offset is unsigned. 
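  // i.e. give up on a zero, negative, or wider-than-32-bit offset; anything
  // else is materialized into an SGPR with S_MOV_B32 below and used as the
  // offset operand of the _SGPR variant.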
3119 if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) 3120 return None; 3121 3122 // If we make it this far we have a load with an 32-bit immediate offset. 3123 // It is OK to select this using a sgpr offset, because we have already 3124 // failed trying to select this load into one of the _IMM variants since 3125 // the _IMM Patterns are considered before the _SGPR patterns. 3126 Register PtrReg = GEPInfo.SgprParts[0]; 3127 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3128 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) 3129 .addImm(GEPInfo.Imm); 3130 return {{ 3131 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3132 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } 3133 }}; 3134 } 3135 3136 template <bool Signed> 3137 InstructionSelector::ComplexRendererFns 3138 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { 3139 MachineInstr *MI = Root.getParent(); 3140 3141 InstructionSelector::ComplexRendererFns Default = {{ 3142 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3143 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset 3144 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 3145 }}; 3146 3147 if (!STI.hasFlatInstOffsets()) 3148 return Default; 3149 3150 const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); 3151 if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD) 3152 return Default; 3153 3154 Optional<int64_t> Offset = 3155 getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); 3156 if (!Offset.hasValue()) 3157 return Default; 3158 3159 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 3160 if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed)) 3161 return Default; 3162 3163 Register BasePtr = OpDef->getOperand(1).getReg(); 3164 3165 return {{ 3166 [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, 3167 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, 3168 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 3169 }}; 3170 } 3171 3172 InstructionSelector::ComplexRendererFns 3173 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 3174 return selectFlatOffsetImpl<false>(Root); 3175 } 3176 3177 InstructionSelector::ComplexRendererFns 3178 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { 3179 return selectFlatOffsetImpl<true>(Root); 3180 } 3181 3182 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 3183 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 3184 return PSV && PSV->isStack(); 3185 } 3186 3187 InstructionSelector::ComplexRendererFns 3188 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 3189 MachineInstr *MI = Root.getParent(); 3190 MachineBasicBlock *MBB = MI->getParent(); 3191 MachineFunction *MF = MBB->getParent(); 3192 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 3193 3194 int64_t Offset = 0; 3195 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) && 3196 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) { 3197 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3198 3199 // TODO: Should this be inside the render function? The iterator seems to 3200 // move. 
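    // Split the constant into a 4096-byte aligned base kept in a VGPR and a
    // 12-bit immediate, e.g. (illustrative) Offset = 0x1234 gives
    // HighBits = 0x1000 from (Offset & ~4095) and 0x234 from (Offset & 4095).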
3201 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3202 HighBits)
3203 .addImm(Offset & ~4095);
3204
3205 return {{[=](MachineInstrBuilder &MIB) { // rsrc
3206 MIB.addReg(Info->getScratchRSrcReg());
3207 },
3208 [=](MachineInstrBuilder &MIB) { // vaddr
3209 MIB.addReg(HighBits);
3210 },
3211 [=](MachineInstrBuilder &MIB) { // soffset
3212 const MachineMemOperand *MMO = *MI->memoperands_begin();
3213 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3214
3215 if (isStackPtrRelative(PtrInfo))
3216 MIB.addReg(Info->getStackPtrOffsetReg());
3217 else
3218 MIB.addImm(0);
3219 },
3220 [=](MachineInstrBuilder &MIB) { // offset
3221 MIB.addImm(Offset & 4095);
3222 }}};
3223 }
3224
3225 assert(Offset == 0 || Offset == -1);
3226
3227 // Try to fold a frame index directly into the MUBUF vaddr field, and any
3228 // offsets.
3229 Optional<int> FI;
3230 Register VAddr = Root.getReg();
3231 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3232 if (isBaseWithConstantOffset(Root, *MRI)) {
3233 const MachineOperand &LHS = RootDef->getOperand(1);
3234 const MachineOperand &RHS = RootDef->getOperand(2);
3235 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3236 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3237 if (LHSDef && RHSDef) {
3238 int64_t PossibleOffset =
3239 RHSDef->getOperand(1).getCImm()->getSExtValue();
3240 if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3241 (!STI.privateMemoryResourceIsRangeChecked() ||
3242 KnownBits->signBitIsZero(LHS.getReg()))) {
3243 if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3244 FI = LHSDef->getOperand(1).getIndex();
3245 else
3246 VAddr = LHS.getReg();
3247 Offset = PossibleOffset;
3248 }
3249 }
3250 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3251 FI = RootDef->getOperand(1).getIndex();
3252 }
3253 }
3254
3255 return {{[=](MachineInstrBuilder &MIB) { // rsrc
3256 MIB.addReg(Info->getScratchRSrcReg());
3257 },
3258 [=](MachineInstrBuilder &MIB) { // vaddr
3259 if (FI.hasValue())
3260 MIB.addFrameIndex(FI.getValue());
3261 else
3262 MIB.addReg(VAddr);
3263 },
3264 [=](MachineInstrBuilder &MIB) { // soffset
3265 // If we don't know this private access is a local stack object, it
3266 // needs to be relative to the entry point's scratch wave offset.
3267 // TODO: Should split large offsets that don't fit like above.
3268 // TODO: Don't use scratch wave offset just because the offset
3269 // didn't fit.
3270 if (!Info->isEntryFunction() && FI.hasValue())
3271 MIB.addReg(Info->getStackPtrOffsetReg());
3272 else
3273 MIB.addImm(0);
3274 },
3275 [=](MachineInstrBuilder &MIB) { // offset
3276 MIB.addImm(Offset);
3277 }}};
3278 }
3279
3280 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3281 int64_t Offset,
3282 unsigned OffsetBits) const {
3283 if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3284 (OffsetBits == 8 && !isUInt<8>(Offset)))
3285 return false;
3286
3287 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3288 return true;
3289
3290 // On Southern Islands, instructions with a negative base value and an offset
3291 // don't seem to work.
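  // So require a provably non-negative base: if the sign bit of Base is known
  // to be zero, adding the instruction's unsigned offset cannot recreate that
  // negative-base-plus-offset situation.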
  return KnownBits->signBitIsZero(Base);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(Info->getScratchRSrcReg());
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (isStackPtrRelative(PtrInfo))
          MIB.addReg(Info->getStackPtrOffsetReg());
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    if (isDSOffsetLegal(PtrBase, Offset, 16)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, Offset);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
    }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    int64_t DWordOffset0 = Offset / 4;
    int64_t DWordOffset1 = DWordOffset0 + 1;
    if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, DWordOffset0);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

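    // A fully constant address could presumably be materialized and selected
    // with a zero offset here; for now it falls through to the (Root, 0)
    // default below.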
  }

  return std::make_pair(Root.getReg(), 0);
}

/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this
/// does not match the pattern.
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
    Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = MRI.getVRegDef(Root);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0};

  MachineOperand &RHS = RootI->getOperand(2);
  Optional<ValueAndVReg> MaybeOffset
    = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
  if (!MaybeOffset)
    return {Root, 0};
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
}

static void addZeroImm(MachineInstrBuilder &MIB) {
  MIB.addImm(0);
}

/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If
/// \p BasePtr is not valid, a null base pointer will be used.
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}

static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}

static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
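  // Unlike buildAddr64RSrc above, the low constant dword here is ~0u, which
  // presumably sets NUM_RECORDS to its maximum so purely offset-addressed
  // accesses are not clipped by the resource's bounds check.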
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: Don't know that this was defined by operand 0
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return whether the addr64 mubuf mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
    MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(SOffset)
    .addImm(ImmOffset);
  ImmOffset = 0;
}

bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
    MachineOperand &Root, Register &VAddr, Register &RSrcReg,
    Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // addr64 bit was removed for Volcanic Islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
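      // e.g. for (ptr_add (sgpr base), (vgpr index)) the uniform operand
      // becomes the SRD base pointer and the divergent one goes in vaddr.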
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
    MachineOperand &Root, Register &RSrcReg, Register &SOffset,
    int64_t &Offset) const {
  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Register SRDPtr = AddrData.N0;
  Offset = AddrData.Offset;

  // TODO: Look through extensions for 32-bit soffset.
  MachineIRBuilder B(*Root.getParent());

  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
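  // Note: unlike the non-atomic selectMUBUFAddr64 above, only slc is rendered
  // here; for buffer atomics glc encodes whether the pre-op value is returned,
  // which is presumably determined by the selected (_RTN) opcode rather than
  // by this pattern.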
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm // slc
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm // slc
    }};
}

/// Get an immediate that must be 32 bits, and treated as zero-extended.
static Optional<uint64_t> getConstantZext32Val(Register Reg,
                                               const MachineRegisterInfo &MRI) {
  // getConstantVRegVal sexts any values, so see if that matters.
  Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return None;
  return Lo_32(*OffsetVal);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm =
    AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm
    = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx == -1);

  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
    MIB.addImm(Op.getCImm()->getSExtValue());
  }
}
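/// Render the population count of a G_CONSTANT operand as an immediate; e.g.
/// a constant of 0xF0F0 is emitted as 8.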
void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
}

/// This only really exists to satisfy DAG type checking machinery, so is a
/// no-op here.
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm());
}

void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
}

void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
}

void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
}

void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}

bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}
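// Informal note: the inline-immediate predicates above accept values the
// hardware can encode directly in the instruction word, e.g. integers in
// [-16, 64] and a handful of floats such as 0.0, +-0.5, +-1.0, +-2.0, +-4.0
// (plus 1/(2*pi) on subtargets where hasInv2PiInlineImm() is true), so these
// operands never consume a literal constant slot in the encoding.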