//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                        CodeGenCoverage &CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

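// Rewrite a copy-like intrinsic (wqm, softwqm, strict.wwm, strict.wqm) in
// place: swap in the target pseudo opcode, drop the intrinsic ID operand, add
// the implicit EXEC use, and constrain source and destination to a common
// register class.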
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Optional<ValueAndVReg> ConstVal =
        getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
          .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
                          AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
        BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
          .addImm(1)
          .addReg(SrcReg);
        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
          .addImm(0)
          .addReg(MaskedReg);
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

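// Select G_AND/G_OR/G_XOR whose result lives on the SGPR or VCC bank to the
// scalar S_AND/S_OR/S_XOR forms (32- or 64-bit), adding the dead implicit def
// of SCC.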
bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);


  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

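// Select the overflowing add/sub variants. If the carry-out is on the VCC
// bank, use the VALU V_ADD_CO/V_ADDC (or V_SUB_CO/V_SUBB) forms; otherwise use
// the SCC-based scalar opcodes and copy SCC in and out explicitly.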
bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
      IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
    .addReg(AMDGPU::SCC);

  if (!MRI->getRegClassOrNull(Dst1Reg))
    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

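// Select a v2s16 G_BUILD_VECTOR_TRUNC of s32 sources on the SGPR bank, folding
// constant inputs into a single S_MOV_B32 and lshr-by-16 sources into the
// S_PACK_*_B32_B16 variants.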
bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
  MachineInstr &MI) const {
  if (selectImpl(MI, *CoverageInfo))
    return true;

  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != V2S16)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI->getType(Src0) != S32)
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;

      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
        .addImm(Lo16 | (Hi16 << 16));
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
           RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
    // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
    auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
      .addReg(ShiftSrc0)
      .addImm(16);

    MI.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  Optional<ValueAndVReg> ConstSelect =
    getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    Optional<ValueAndVReg> ConstVal =
      getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
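// Select llvm.amdgcn.div.scale by hand to V_DIV_SCALE_F32_e64 /
// V_DIV_SCALE_F64_e64, wiring up both result registers.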
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
    return selectIntrinsicIcmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    return selectSMFMACIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
llvm_unreachable("Unknown condition code!"); 995 case CmpInst::ICMP_NE: 996 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64; 997 case CmpInst::ICMP_EQ: 998 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64; 999 case CmpInst::ICMP_SGT: 1000 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64; 1001 case CmpInst::ICMP_SGE: 1002 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64; 1003 case CmpInst::ICMP_SLT: 1004 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64; 1005 case CmpInst::ICMP_SLE: 1006 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64; 1007 case CmpInst::ICMP_UGT: 1008 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64; 1009 case CmpInst::ICMP_UGE: 1010 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64; 1011 case CmpInst::ICMP_ULT: 1012 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64; 1013 case CmpInst::ICMP_ULE: 1014 return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64; 1015 } 1016 } 1017 1018 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, 1019 unsigned Size) const { 1020 if (Size == 64) { 1021 if (!STI.hasScalarCompareEq64()) 1022 return -1; 1023 1024 switch (P) { 1025 case CmpInst::ICMP_NE: 1026 return AMDGPU::S_CMP_LG_U64; 1027 case CmpInst::ICMP_EQ: 1028 return AMDGPU::S_CMP_EQ_U64; 1029 default: 1030 return -1; 1031 } 1032 } 1033 1034 if (Size != 32) 1035 return -1; 1036 1037 switch (P) { 1038 case CmpInst::ICMP_NE: 1039 return AMDGPU::S_CMP_LG_U32; 1040 case CmpInst::ICMP_EQ: 1041 return AMDGPU::S_CMP_EQ_U32; 1042 case CmpInst::ICMP_SGT: 1043 return AMDGPU::S_CMP_GT_I32; 1044 case CmpInst::ICMP_SGE: 1045 return AMDGPU::S_CMP_GE_I32; 1046 case CmpInst::ICMP_SLT: 1047 return AMDGPU::S_CMP_LT_I32; 1048 case CmpInst::ICMP_SLE: 1049 return AMDGPU::S_CMP_LE_I32; 1050 case CmpInst::ICMP_UGT: 1051 return AMDGPU::S_CMP_GT_U32; 1052 case CmpInst::ICMP_UGE: 1053 return AMDGPU::S_CMP_GE_U32; 1054 case CmpInst::ICMP_ULT: 1055 return AMDGPU::S_CMP_LT_U32; 1056 case CmpInst::ICMP_ULE: 1057 return AMDGPU::S_CMP_LE_U32; 1058 default: 1059 llvm_unreachable("Unknown condition code!"); 1060 } 1061 } 1062 1063 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { 1064 MachineBasicBlock *BB = I.getParent(); 1065 const DebugLoc &DL = I.getDebugLoc(); 1066 1067 Register SrcReg = I.getOperand(2).getReg(); 1068 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 1069 1070 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 1071 1072 Register CCReg = I.getOperand(0).getReg(); 1073 if (!isVCC(CCReg, *MRI)) { 1074 int Opcode = getS_CMPOpcode(Pred, Size); 1075 if (Opcode == -1) 1076 return false; 1077 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) 1078 .add(I.getOperand(2)) 1079 .add(I.getOperand(3)); 1080 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) 1081 .addReg(AMDGPU::SCC); 1082 bool Ret = 1083 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && 1084 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); 1085 I.eraseFromParent(); 1086 return Ret; 1087 } 1088 1089 int Opcode = getV_CMPOpcode(Pred, Size); 1090 if (Opcode == -1) 1091 return false; 1092 1093 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), 1094 I.getOperand(0).getReg()) 1095 .add(I.getOperand(2)) 1096 .add(I.getOperand(3)); 1097 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), 1098 *TRI.getBoolRC(), *MRI); 1099 bool Ret 
bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
            I.getOperand(0).getReg())
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(Pred))) {
    MachineInstr *ICmp =
        BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);

    if (!RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                                      *TRI.getBoolRC(), *MRI))
      return false;
    I.eraseFromParent();
    return true;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
                           .add(I.getOperand(2))
                           .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
                               *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;

  if (Size != STI.getWavefrontSize())
    return false;

  Optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);

  if (Arg.hasValue()) {
    const int64_t Value = Arg.getValue().Value.getSExtValue();
    if (Value == 0) {
      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else if (Value == -1) { // all ones
      Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
    } else
      return false;
  } else {
    Register SrcReg = I.getOperand(2).getReg();
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
  }

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto RelocSymbol = cast<GlobalVariable>(
    M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  MachineBasicBlock *BB = I.getParent();
  BuildMI(*BB, &I, I.getDebugLoc(),
          TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
    .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();

  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
    AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);

  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
    const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    MIB.addImm(MFI->getLDSSize());
  } else {
    Module *M = MF->getFunction().getParent();
    const GlobalValue *GV
      = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
  }

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
      !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions
  if (Depth != 0 ||
      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
      .addImm(0);
    I.eraseFromParent();
    return true;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  // Get the return address reg and mark it as an implicit live-in
  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                             AMDGPU::SReg_64RegClass, DL);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(LiveIn);
  I.eraseFromParent();
  return true;
}

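// Select llvm.amdgcn.end.cf to SI_END_CF and make sure the mask operand is in
// the wave mask register class.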
bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
  MachineBasicBlock *BB = MI.getParent();
  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
      .add(MI.getOperand(1));

  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
  return true;
}

bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
  MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease)
    report_fatal_error("ds_ordered_count: wave_done requires wave_release");

  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      report_fatal_error(
        "ds_ordered_count: dword count must be between 1 and 4");
    }
  }

  if (IndexOperand)
    report_fatal_error("ds_ordered_count: bad index operand");

  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);

  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                     (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !STI.hasGWSSemaReleaseAll())
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  assert(OffsetDef);

  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.

    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset) =
        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16);

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();

    if (STI.needsAlignedVGPRs()) {
      // Add implicit aligned super-reg to force alignment on the data operand.
      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
      Register NewVR =
          MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
      BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
          .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
          .addImm(AMDGPU::sub0)
          .addReg(Undef)
          .addImm(AMDGPU::sub1);
      MIB.addReg(NewVR, 0, AMDGPU::sub0);
      MIB.addReg(NewVR, RegState::Implicit);
    } else {
      MIB.addReg(VSrc);
    }

    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
                 .addImm(Offset)
                 .addImm(IsGDS ? -1 : 0)
                 .cloneMemRefs(MI);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
  if (TM.getOptLevel() > CodeGenOpt::None) {
    unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
    if (WGSize <= STI.getWavefrontSize()) {
      MachineBasicBlock *MBB = MI.getParent();
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
      MI.eraseFromParent();
      return true;
    }
  }
  return selectImpl(MI, *CoverageInfo);
}

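// Decode the texfailctrl immediate into its TFE and LWE bits, recording
// whether any tex-fail handling is enabled. Returns false if unknown bits are
// set.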
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) ? true : false;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) ? true : false;
  TexFailCtrl &= ~(uint64_t)0x2;

  return TexFailCtrl == 0;
}

bool AMDGPUInstructionSelector::selectImageIntrinsic(
  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);

  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
               MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;

  bool Unorm;
  if (!BaseOpcode->Sampler)
    Unorm = true;
  else
    Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
                    TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients if subtarget doesn't support G16
  if (IsA16 && !STI.hasG16() && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      if (IsD16 && !STI.hasUnpackedD16VMem())
        NumVDataDwords = (DMaskLanes + 1) / 2;
    }
  }

  // Set G16 opcode
  if (IsG16 && !IsA16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
1615 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); 1616 1617 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); 1618 if (BaseOpcode->Atomic) 1619 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization 1620 if (CPol & ~AMDGPU::CPol::ALL) 1621 return false; 1622 1623 int NumVAddrRegs = 0; 1624 int NumVAddrDwords = 0; 1625 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 1626 // Skip the $noregs and 0s inserted during legalization. 1627 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I); 1628 if (!AddrOp.isReg()) 1629 continue; // XXX - Break? 1630 1631 Register Addr = AddrOp.getReg(); 1632 if (!Addr) 1633 break; 1634 1635 ++NumVAddrRegs; 1636 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32; 1637 } 1638 1639 // The legalizer preprocessed the intrinsic arguments. If we aren't using 1640 // NSA, these should have been packed into a single value in the first 1641 // address register 1642 const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs; 1643 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { 1644 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n"); 1645 return false; 1646 } 1647 1648 if (IsTexFail) 1649 ++NumVDataDwords; 1650 1651 int Opcode = -1; 1652 if (IsGFX10Plus) { 1653 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 1654 UseNSA ? AMDGPU::MIMGEncGfx10NSA 1655 : AMDGPU::MIMGEncGfx10Default, 1656 NumVDataDwords, NumVAddrDwords); 1657 } else { 1658 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 1659 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, 1660 NumVDataDwords, NumVAddrDwords); 1661 if (Opcode == -1) 1662 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, 1663 NumVDataDwords, NumVAddrDwords); 1664 } 1665 assert(Opcode != -1); 1666 1667 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode)) 1668 .cloneMemRefs(MI); 1669 1670 if (VDataOut) { 1671 if (BaseOpcode->AtomicX2) { 1672 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64; 1673 1674 Register TmpReg = MRI->createVirtualRegister( 1675 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); 1676 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; 1677 1678 MIB.addDef(TmpReg); 1679 if (!MRI->use_empty(VDataOut)) { 1680 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut) 1681 .addReg(TmpReg, RegState::Kill, SubReg); 1682 } 1683 1684 } else { 1685 MIB.addDef(VDataOut); // vdata output 1686 } 1687 } 1688 1689 if (VDataIn) 1690 MIB.addReg(VDataIn); // vdata input 1691 1692 for (int I = 0; I != NumVAddrRegs; ++I) { 1693 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I); 1694 if (SrcOp.isReg()) { 1695 assert(SrcOp.getReg() != 0); 1696 MIB.addReg(SrcOp.getReg()); 1697 } 1698 } 1699 1700 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg()); 1701 if (BaseOpcode->Sampler) 1702 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg()); 1703 1704 MIB.addImm(DMask); // dmask 1705 1706 if (IsGFX10Plus) 1707 MIB.addImm(DimInfo->Encoding); 1708 MIB.addImm(Unorm); 1709 1710 MIB.addImm(CPol); 1711 MIB.addImm(IsA16 && // a16 or r128 1712 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0); 1713 if (IsGFX10Plus) 1714 MIB.addImm(IsA16 ? -1 : 0); 1715 1716 MIB.addImm(TFE); // tfe 1717 MIB.addImm(LWE); // lwe 1718 if (!IsGFX10Plus) 1719 MIB.addImm(DimInfo->DA ? -1 : 0); 1720 if (BaseOpcode->HasD16) 1721 MIB.addImm(IsD16 ? 
-1 : 0); 1722 1723 if (IsTexFail) { 1724 // An image load instruction with TFE/LWE only conditionally writes to its 1725 // result registers. Initialize them to zero so that we always get well 1726 // defined result values. 1727 assert(VDataOut && !VDataIn); 1728 Register Tied = MRI->cloneVirtualRegister(VDataOut); 1729 Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1730 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero) 1731 .addImm(0); 1732 auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4); 1733 if (STI.usePRTStrictNull()) { 1734 // With enable-prt-strict-null enabled, initialize all result registers to 1735 // zero. 1736 auto RegSeq = 1737 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied); 1738 for (auto Sub : Parts) 1739 RegSeq.addReg(Zero).addImm(Sub); 1740 } else { 1741 // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE 1742 // result register. 1743 Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1744 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); 1745 auto RegSeq = 1746 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied); 1747 for (auto Sub : Parts.drop_back(1)) 1748 RegSeq.addReg(Undef).addImm(Sub); 1749 RegSeq.addReg(Zero).addImm(Parts.back()); 1750 } 1751 MIB.addReg(Tied, RegState::Implicit); 1752 MIB->tieOperands(0, MIB->getNumOperands() - 1); 1753 } 1754 1755 MI.eraseFromParent(); 1756 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1757 } 1758 1759 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( 1760 MachineInstr &I) const { 1761 unsigned IntrinsicID = I.getIntrinsicID(); 1762 switch (IntrinsicID) { 1763 case Intrinsic::amdgcn_end_cf: 1764 return selectEndCfIntrinsic(I); 1765 case Intrinsic::amdgcn_ds_ordered_add: 1766 case Intrinsic::amdgcn_ds_ordered_swap: 1767 return selectDSOrderedIntrinsic(I, IntrinsicID); 1768 case Intrinsic::amdgcn_ds_gws_init: 1769 case Intrinsic::amdgcn_ds_gws_barrier: 1770 case Intrinsic::amdgcn_ds_gws_sema_v: 1771 case Intrinsic::amdgcn_ds_gws_sema_br: 1772 case Intrinsic::amdgcn_ds_gws_sema_p: 1773 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1774 return selectDSGWSIntrinsic(I, IntrinsicID); 1775 case Intrinsic::amdgcn_ds_append: 1776 return selectDSAppendConsume(I, true); 1777 case Intrinsic::amdgcn_ds_consume: 1778 return selectDSAppendConsume(I, false); 1779 case Intrinsic::amdgcn_s_barrier: 1780 return selectSBarrier(I); 1781 case Intrinsic::amdgcn_global_atomic_fadd: 1782 return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3)); 1783 case Intrinsic::amdgcn_raw_buffer_load_lds: 1784 case Intrinsic::amdgcn_struct_buffer_load_lds: 1785 return selectBufferLoadLds(I); 1786 case Intrinsic::amdgcn_global_load_lds: 1787 return selectGlobalLoadLds(I); 1788 default: { 1789 return selectImpl(I, *CoverageInfo); 1790 } 1791 } 1792 } 1793 1794 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { 1795 if (selectImpl(I, *CoverageInfo)) 1796 return true; 1797 1798 MachineBasicBlock *BB = I.getParent(); 1799 const DebugLoc &DL = I.getDebugLoc(); 1800 1801 Register DstReg = I.getOperand(0).getReg(); 1802 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 1803 assert(Size <= 32 || Size == 64); 1804 const MachineOperand &CCOp = I.getOperand(1); 1805 Register CCReg = CCOp.getReg(); 1806 if (!isVCC(CCReg, *MRI)) { 1807 unsigned SelectOpcode = Size == 64 ? 
AMDGPU::S_CSELECT_B64 :
1808                                               AMDGPU::S_CSELECT_B32;
1809   MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1810       .addReg(CCReg);
1811
1812   // The generic constrainSelectedInstRegOperands doesn't work for the scc
1813   // register bank, because it does not cover the register class we use to
1814   // represent it. So we need to set the register class manually here.
1815   if (!MRI->getRegClassOrNull(CCReg))
1816     MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1817   MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1818       .add(I.getOperand(2))
1819       .add(I.getOperand(3));
1820
1821   bool Ret = false;
1822   Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1823   Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1824   I.eraseFromParent();
1825   return Ret;
1826   }
1827
1828   // Wide VGPR select should have been split in RegBankSelect.
1829   if (Size > 32)
1830     return false;
1831
1832   MachineInstr *Select =
1833       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1834           .addImm(0)
1835           .add(I.getOperand(3))
1836           .addImm(0)
1837           .add(I.getOperand(2))
1838           .add(I.getOperand(1));
1839
1840   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1841   I.eraseFromParent();
1842   return Ret;
1843 }
1844
1845 static int sizeToSubRegIndex(unsigned Size) {
1846   switch (Size) {
1847   case 32:
1848     return AMDGPU::sub0;
1849   case 64:
1850     return AMDGPU::sub0_sub1;
1851   case 96:
1852     return AMDGPU::sub0_sub1_sub2;
1853   case 128:
1854     return AMDGPU::sub0_sub1_sub2_sub3;
1855   case 256:
1856     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1857   default:
1858     if (Size < 32)
1859       return AMDGPU::sub0;
1860     if (Size > 256)
1861       return -1;
1862     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1863   }
1864 }
1865
1866 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1867   Register DstReg = I.getOperand(0).getReg();
1868   Register SrcReg = I.getOperand(1).getReg();
1869   const LLT DstTy = MRI->getType(DstReg);
1870   const LLT SrcTy = MRI->getType(SrcReg);
1871   const LLT S1 = LLT::scalar(1);
1872
1873   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1874   const RegisterBank *DstRB;
1875   if (DstTy == S1) {
1876     // This is a special case. We don't treat s1 for legalization artifacts as
1877     // vcc booleans.
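    // Reuse the source bank so a trunc to s1 stays on whatever bank produced
    // its input.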
1878 DstRB = SrcRB; 1879 } else { 1880 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1881 if (SrcRB != DstRB) 1882 return false; 1883 } 1884 1885 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 1886 1887 unsigned DstSize = DstTy.getSizeInBits(); 1888 unsigned SrcSize = SrcTy.getSizeInBits(); 1889 1890 const TargetRegisterClass *SrcRC = 1891 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB); 1892 const TargetRegisterClass *DstRC = 1893 TRI.getRegClassForSizeOnBank(DstSize, *DstRB); 1894 if (!SrcRC || !DstRC) 1895 return false; 1896 1897 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1898 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1899 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1900 return false; 1901 } 1902 1903 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) { 1904 MachineBasicBlock *MBB = I.getParent(); 1905 const DebugLoc &DL = I.getDebugLoc(); 1906 1907 Register LoReg = MRI->createVirtualRegister(DstRC); 1908 Register HiReg = MRI->createVirtualRegister(DstRC); 1909 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 1910 .addReg(SrcReg, 0, AMDGPU::sub0); 1911 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 1912 .addReg(SrcReg, 0, AMDGPU::sub1); 1913 1914 if (IsVALU && STI.hasSDWA()) { 1915 // Write the low 16-bits of the high element into the high 16-bits of the 1916 // low element. 1917 MachineInstr *MovSDWA = 1918 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1919 .addImm(0) // $src0_modifiers 1920 .addReg(HiReg) // $src0 1921 .addImm(0) // $clamp 1922 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1923 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1924 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1925 .addReg(LoReg, RegState::Implicit); 1926 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1927 } else { 1928 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1929 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1930 Register ImmReg = MRI->createVirtualRegister(DstRC); 1931 if (IsVALU) { 1932 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1933 .addImm(16) 1934 .addReg(HiReg); 1935 } else { 1936 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1937 .addReg(HiReg) 1938 .addImm(16); 1939 } 1940 1941 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1942 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1943 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1944 1945 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1946 .addImm(0xffff); 1947 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1948 .addReg(LoReg) 1949 .addReg(ImmReg); 1950 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1951 .addReg(TmpReg0) 1952 .addReg(TmpReg1); 1953 } 1954 1955 I.eraseFromParent(); 1956 return true; 1957 } 1958 1959 if (!DstTy.isScalar()) 1960 return false; 1961 1962 if (SrcSize > 32) { 1963 int SubRegIdx = sizeToSubRegIndex(DstSize); 1964 if (SubRegIdx == -1) 1965 return false; 1966 1967 // Deal with weird cases where the class only partially supports the subreg 1968 // index. 
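  // getSubClassWithSubReg returns the largest subclass of SrcRC supporting
  // SubRegIdx, or null if no such class exists.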
1969 const TargetRegisterClass *SrcWithSubRC 1970 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1971 if (!SrcWithSubRC) 1972 return false; 1973 1974 if (SrcWithSubRC != SrcRC) { 1975 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1976 return false; 1977 } 1978 1979 I.getOperand(1).setSubReg(SubRegIdx); 1980 } 1981 1982 I.setDesc(TII.get(TargetOpcode::COPY)); 1983 return true; 1984 } 1985 1986 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1987 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1988 Mask = maskTrailingOnes<unsigned>(Size); 1989 int SignedMask = static_cast<int>(Mask); 1990 return SignedMask >= -16 && SignedMask <= 64; 1991 } 1992 1993 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1994 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1995 Register Reg, const MachineRegisterInfo &MRI, 1996 const TargetRegisterInfo &TRI) const { 1997 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1998 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1999 return RB; 2000 2001 // Ignore the type, since we don't use vcc in artifacts. 2002 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 2003 return &RBI.getRegBankFromRegClass(*RC, LLT()); 2004 return nullptr; 2005 } 2006 2007 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 2008 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 2009 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 2010 const DebugLoc &DL = I.getDebugLoc(); 2011 MachineBasicBlock &MBB = *I.getParent(); 2012 const Register DstReg = I.getOperand(0).getReg(); 2013 const Register SrcReg = I.getOperand(1).getReg(); 2014 2015 const LLT DstTy = MRI->getType(DstReg); 2016 const LLT SrcTy = MRI->getType(SrcReg); 2017 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 2018 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 2019 const unsigned DstSize = DstTy.getSizeInBits(); 2020 if (!DstTy.isScalar()) 2021 return false; 2022 2023 // Artifact casts should never use vcc. 2024 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 2025 2026 // FIXME: This should probably be illegal and split earlier. 2027 if (I.getOpcode() == AMDGPU::G_ANYEXT) { 2028 if (DstSize <= 32) 2029 return selectCOPY(I); 2030 2031 const TargetRegisterClass *SrcRC = 2032 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank); 2033 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 2034 const TargetRegisterClass *DstRC = 2035 TRI.getRegClassForSizeOnBank(DstSize, *DstBank); 2036 2037 Register UndefReg = MRI->createVirtualRegister(SrcRC); 2038 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 2039 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2040 .addReg(SrcReg) 2041 .addImm(AMDGPU::sub0) 2042 .addReg(UndefReg) 2043 .addImm(AMDGPU::sub1); 2044 I.eraseFromParent(); 2045 2046 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) && 2047 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI); 2048 } 2049 2050 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 2051 // 64-bit should have been split up in RegBankSelect 2052 2053 // Try to use an and with a mask if it will save code size. 
2054 unsigned Mask; 2055 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2056 MachineInstr *ExtI = 2057 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 2058 .addImm(Mask) 2059 .addReg(SrcReg); 2060 I.eraseFromParent(); 2061 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2062 } 2063 2064 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; 2065 MachineInstr *ExtI = 2066 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 2067 .addReg(SrcReg) 2068 .addImm(0) // Offset 2069 .addImm(SrcSize); // Width 2070 I.eraseFromParent(); 2071 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2072 } 2073 2074 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 2075 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 2076 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 2077 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 2078 return false; 2079 2080 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 2081 const unsigned SextOpc = SrcSize == 8 ? 2082 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 2083 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 2084 .addReg(SrcReg); 2085 I.eraseFromParent(); 2086 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2087 } 2088 2089 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 2090 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 2091 2092 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 2093 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 2094 // We need a 64-bit register source, but the high bits don't matter. 2095 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 2096 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2097 unsigned SubReg = InReg ? AMDGPU::sub0 : 0; 2098 2099 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 2100 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 2101 .addReg(SrcReg, 0, SubReg) 2102 .addImm(AMDGPU::sub0) 2103 .addReg(UndefReg) 2104 .addImm(AMDGPU::sub1); 2105 2106 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 2107 .addReg(ExtReg) 2108 .addImm(SrcSize << 16); 2109 2110 I.eraseFromParent(); 2111 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 2112 } 2113 2114 unsigned Mask; 2115 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2116 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 2117 .addReg(SrcReg) 2118 .addImm(Mask); 2119 } else { 2120 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 2121 .addReg(SrcReg) 2122 .addImm(SrcSize << 16); 2123 } 2124 2125 I.eraseFromParent(); 2126 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2127 } 2128 2129 return false; 2130 } 2131 2132 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 2133 MachineBasicBlock *BB = I.getParent(); 2134 MachineOperand &ImmOp = I.getOperand(1); 2135 Register DstReg = I.getOperand(0).getReg(); 2136 unsigned Size = MRI->getType(DstReg).getSizeInBits(); 2137 2138 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 
2139 if (ImmOp.isFPImm()) { 2140 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 2141 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 2142 } else if (ImmOp.isCImm()) { 2143 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); 2144 } else { 2145 llvm_unreachable("Not supported by g_constants"); 2146 } 2147 2148 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2149 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID; 2150 2151 unsigned Opcode; 2152 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 2153 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 2154 } else { 2155 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2156 2157 // We should never produce s1 values on banks other than VCC. If the user of 2158 // this already constrained the register, we may incorrectly think it's VCC 2159 // if it wasn't originally. 2160 if (Size == 1) 2161 return false; 2162 } 2163 2164 if (Size != 64) { 2165 I.setDesc(TII.get(Opcode)); 2166 I.addImplicitDefUseOperands(*MF); 2167 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2168 } 2169 2170 const DebugLoc &DL = I.getDebugLoc(); 2171 2172 APInt Imm(Size, I.getOperand(1).getImm()); 2173 2174 MachineInstr *ResInst; 2175 if (IsSgpr && TII.isInlineConstant(Imm)) { 2176 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 2177 .addImm(I.getOperand(1).getImm()); 2178 } else { 2179 const TargetRegisterClass *RC = IsSgpr ? 2180 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 2181 Register LoReg = MRI->createVirtualRegister(RC); 2182 Register HiReg = MRI->createVirtualRegister(RC); 2183 2184 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 2185 .addImm(Imm.trunc(32).getZExtValue()); 2186 2187 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 2188 .addImm(Imm.ashr(32).getZExtValue()); 2189 2190 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2191 .addReg(LoReg) 2192 .addImm(AMDGPU::sub0) 2193 .addReg(HiReg) 2194 .addImm(AMDGPU::sub1); 2195 } 2196 2197 // We can't call constrainSelectedInstRegOperands here, because it doesn't 2198 // work for target independent opcodes 2199 I.eraseFromParent(); 2200 const TargetRegisterClass *DstRC = 2201 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 2202 if (!DstRC) 2203 return true; 2204 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 2205 } 2206 2207 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2208 // Only manually handle the f64 SGPR case. 2209 // 2210 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2211 // the bit ops theoretically have a second result due to the implicit def of 2212 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2213 // that is easy by disabling the check. The result works, but uses a 2214 // nonsensical sreg32orlds_and_sreg_1 regclass. 2215 // 2216 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2217 // the variadic REG_SEQUENCE operands. 
2218
2219   Register Dst = MI.getOperand(0).getReg();
2220   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2221   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2222       MRI->getType(Dst) != LLT::scalar(64))
2223     return false;
2224
2225   Register Src = MI.getOperand(1).getReg();
2226   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2227   if (Fabs)
2228     Src = Fabs->getOperand(1).getReg();
2229
2230   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2231       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2232     return false;
2233
2234   MachineBasicBlock *BB = MI.getParent();
2235   const DebugLoc &DL = MI.getDebugLoc();
2236   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2237   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2238   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2239   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2240
2241   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2242       .addReg(Src, 0, AMDGPU::sub0);
2243   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2244       .addReg(Src, 0, AMDGPU::sub1);
2245   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2246       .addImm(0x80000000);
2247
2248   // Set or toggle sign bit.
2249   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2250   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2251       .addReg(HiReg)
2252       .addReg(ConstReg);
2253   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2254       .addReg(LoReg)
2255       .addImm(AMDGPU::sub0)
2256       .addReg(OpReg)
2257       .addImm(AMDGPU::sub1);
2258   MI.eraseFromParent();
2259   return true;
2260 }
2261
2262 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2263 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2264   Register Dst = MI.getOperand(0).getReg();
2265   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2266   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2267       MRI->getType(Dst) != LLT::scalar(64))
2268     return false;
2269
2270   Register Src = MI.getOperand(1).getReg();
2271   MachineBasicBlock *BB = MI.getParent();
2272   const DebugLoc &DL = MI.getDebugLoc();
2273   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2274   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2275   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2276   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2277
2278   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2279       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2280     return false;
2281
2282   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2283       .addReg(Src, 0, AMDGPU::sub0);
2284   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2285       .addReg(Src, 0, AMDGPU::sub1);
2286   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2287       .addImm(0x7fffffff);
2288
2289   // Clear sign bit.
2290   // TODO: Should this use S_BITSET0_*?
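  // Mirrors the G_FNEG expansion above, but ANDs the high half with
  // 0x7fffffff so only the sign bit is cleared.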
2291 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) 2292 .addReg(HiReg) 2293 .addReg(ConstReg); 2294 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2295 .addReg(LoReg) 2296 .addImm(AMDGPU::sub0) 2297 .addReg(OpReg) 2298 .addImm(AMDGPU::sub1); 2299 2300 MI.eraseFromParent(); 2301 return true; 2302 } 2303 2304 static bool isConstant(const MachineInstr &MI) { 2305 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 2306 } 2307 2308 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 2309 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 2310 2311 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); 2312 2313 assert(PtrMI); 2314 2315 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 2316 return; 2317 2318 GEPInfo GEPInfo(*PtrMI); 2319 2320 for (unsigned i = 1; i != 3; ++i) { 2321 const MachineOperand &GEPOp = PtrMI->getOperand(i); 2322 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 2323 assert(OpDef); 2324 if (i == 2 && isConstant(*OpDef)) { 2325 // TODO: Could handle constant base + variable offset, but a combine 2326 // probably should have commuted it. 2327 assert(GEPInfo.Imm == 0); 2328 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 2329 continue; 2330 } 2331 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 2332 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 2333 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 2334 else 2335 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 2336 } 2337 2338 AddrInfo.push_back(GEPInfo); 2339 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 2340 } 2341 2342 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const { 2343 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID; 2344 } 2345 2346 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 2347 if (!MI.hasOneMemOperand()) 2348 return false; 2349 2350 const MachineMemOperand *MMO = *MI.memoperands_begin(); 2351 const Value *Ptr = MMO->getValue(); 2352 2353 // UndefValue means this is a load of a kernel input. These are uniform. 2354 // Sometimes LDS instructions have constant pointers. 2355 // If Ptr is null, then that means this mem operand contains a 2356 // PseudoSourceValue like GOT. 2357 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || 2358 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) 2359 return true; 2360 2361 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 2362 return true; 2363 2364 const Instruction *I = dyn_cast<Instruction>(Ptr); 2365 return I && I->getMetadata("amdgpu.uniform"); 2366 } 2367 2368 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { 2369 for (const GEPInfo &GEPInfo : AddrInfo) { 2370 if (!GEPInfo.VgprParts.empty()) 2371 return true; 2372 } 2373 return false; 2374 } 2375 2376 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { 2377 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); 2378 unsigned AS = PtrTy.getAddressSpace(); 2379 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && 2380 STI.ldsRequiresM0Init()) { 2381 MachineBasicBlock *BB = I.getParent(); 2382 2383 // If DS instructions require M0 initialization, insert it before selecting. 
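    // M0 bounds DS addressing on these subtargets; writing all ones (-1)
    // makes the entire LDS range addressable.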
2384 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 2385 .addImm(-1); 2386 } 2387 } 2388 2389 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( 2390 MachineInstr &I) const { 2391 if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) { 2392 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); 2393 unsigned AS = PtrTy.getAddressSpace(); 2394 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 2395 return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2)); 2396 } 2397 2398 initM0(I); 2399 return selectImpl(I, *CoverageInfo); 2400 } 2401 2402 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) { 2403 if (Reg.isPhysical()) 2404 return false; 2405 2406 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg); 2407 const unsigned Opcode = MI.getOpcode(); 2408 2409 if (Opcode == AMDGPU::COPY) 2410 return isVCmpResult(MI.getOperand(1).getReg(), MRI); 2411 2412 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR || 2413 Opcode == AMDGPU::G_XOR) 2414 return isVCmpResult(MI.getOperand(1).getReg(), MRI) && 2415 isVCmpResult(MI.getOperand(2).getReg(), MRI); 2416 2417 if (Opcode == TargetOpcode::G_INTRINSIC) 2418 return MI.getIntrinsicID() == Intrinsic::amdgcn_class; 2419 2420 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP; 2421 } 2422 2423 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 2424 MachineBasicBlock *BB = I.getParent(); 2425 MachineOperand &CondOp = I.getOperand(0); 2426 Register CondReg = CondOp.getReg(); 2427 const DebugLoc &DL = I.getDebugLoc(); 2428 2429 unsigned BrOpcode; 2430 Register CondPhysReg; 2431 const TargetRegisterClass *ConstrainRC; 2432 2433 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2434 // whether the branch is uniform when selecting the instruction. In 2435 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 2436 // RegBankSelect knows what it's doing if the branch condition is scc, even 2437 // though it currently does not. 2438 if (!isVCC(CondReg, *MRI)) { 2439 if (MRI->getType(CondReg) != LLT::scalar(32)) 2440 return false; 2441 2442 CondPhysReg = AMDGPU::SCC; 2443 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 2444 ConstrainRC = &AMDGPU::SReg_32RegClass; 2445 } else { 2446 // FIXME: Should scc->vcc copies and with exec? 2447 2448 // Unless the value of CondReg is a result of a V_CMP* instruction then we 2449 // need to insert an and with exec. 2450 if (!isVCmpResult(CondReg, *MRI)) { 2451 const bool Is64 = STI.isWave64(); 2452 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 2453 const Register Exec = Is64 ? 
AMDGPU::EXEC : AMDGPU::EXEC_LO; 2454 2455 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC()); 2456 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg) 2457 .addReg(CondReg) 2458 .addReg(Exec); 2459 CondReg = TmpReg; 2460 } 2461 2462 CondPhysReg = TRI.getVCC(); 2463 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 2464 ConstrainRC = TRI.getBoolRC(); 2465 } 2466 2467 if (!MRI->getRegClassOrNull(CondReg)) 2468 MRI->setRegClass(CondReg, ConstrainRC); 2469 2470 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 2471 .addReg(CondReg); 2472 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 2473 .addMBB(I.getOperand(1).getMBB()); 2474 2475 I.eraseFromParent(); 2476 return true; 2477 } 2478 2479 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE( 2480 MachineInstr &I) const { 2481 Register DstReg = I.getOperand(0).getReg(); 2482 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2483 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2484 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 2485 if (IsVGPR) 2486 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 2487 2488 return RBI.constrainGenericRegister( 2489 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 2490 } 2491 2492 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 2493 Register DstReg = I.getOperand(0).getReg(); 2494 Register SrcReg = I.getOperand(1).getReg(); 2495 Register MaskReg = I.getOperand(2).getReg(); 2496 LLT Ty = MRI->getType(DstReg); 2497 LLT MaskTy = MRI->getType(MaskReg); 2498 MachineBasicBlock *BB = I.getParent(); 2499 const DebugLoc &DL = I.getDebugLoc(); 2500 2501 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2502 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2503 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 2504 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2505 if (DstRB != SrcRB) // Should only happen for hand written MIR. 2506 return false; 2507 2508 // Try to avoid emitting a bit operation when we only need to touch half of 2509 // the 64-bit pointer. 2510 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); 2511 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 2512 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 2513 2514 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32; 2515 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32; 2516 2517 if (!IsVGPR && Ty.getSizeInBits() == 64 && 2518 !CanCopyLow32 && !CanCopyHi32) { 2519 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg) 2520 .addReg(SrcReg) 2521 .addReg(MaskReg); 2522 I.eraseFromParent(); 2523 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2524 } 2525 2526 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2527 const TargetRegisterClass &RegRC 2528 = IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2529 2530 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB); 2531 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB); 2532 const TargetRegisterClass *MaskRC = 2533 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB); 2534 2535 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2536 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2537 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 2538 return false; 2539 2540 if (Ty.getSizeInBits() == 32) { 2541 assert(MaskTy.getSizeInBits() == 32 && 2542 "ptrmask should have been narrowed during legalize"); 2543 2544 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 2545 .addReg(SrcReg) 2546 .addReg(MaskReg); 2547 I.eraseFromParent(); 2548 return true; 2549 } 2550 2551 Register HiReg = MRI->createVirtualRegister(&RegRC); 2552 Register LoReg = MRI->createVirtualRegister(&RegRC); 2553 2554 // Extract the subregisters from the source pointer. 2555 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 2556 .addReg(SrcReg, 0, AMDGPU::sub0); 2557 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 2558 .addReg(SrcReg, 0, AMDGPU::sub1); 2559 2560 Register MaskedLo, MaskedHi; 2561 2562 if (CanCopyLow32) { 2563 // If all the bits in the low half are 1, we only need a copy for it. 2564 MaskedLo = LoReg; 2565 } else { 2566 // Extract the mask subregister and apply the and. 2567 Register MaskLo = MRI->createVirtualRegister(&RegRC); 2568 MaskedLo = MRI->createVirtualRegister(&RegRC); 2569 2570 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 2571 .addReg(MaskReg, 0, AMDGPU::sub0); 2572 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 2573 .addReg(LoReg) 2574 .addReg(MaskLo); 2575 } 2576 2577 if (CanCopyHi32) { 2578 // If all the bits in the high half are 1, we only need a copy for it. 2579 MaskedHi = HiReg; 2580 } else { 2581 Register MaskHi = MRI->createVirtualRegister(&RegRC); 2582 MaskedHi = MRI->createVirtualRegister(&RegRC); 2583 2584 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 2585 .addReg(MaskReg, 0, AMDGPU::sub1); 2586 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 2587 .addReg(HiReg) 2588 .addReg(MaskHi); 2589 } 2590 2591 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2592 .addReg(MaskedLo) 2593 .addImm(AMDGPU::sub0) 2594 .addReg(MaskedHi) 2595 .addImm(AMDGPU::sub1); 2596 I.eraseFromParent(); 2597 return true; 2598 } 2599 2600 /// Return the register to use for the index value, and the subregister to use 2601 /// for the indirectly accessed register. 2602 static std::pair<Register, unsigned> 2603 computeIndirectRegIndex(MachineRegisterInfo &MRI, 2604 const SIRegisterInfo &TRI, 2605 const TargetRegisterClass *SuperRC, 2606 Register IdxReg, 2607 unsigned EltSize) { 2608 Register IdxBaseReg; 2609 int Offset; 2610 2611 std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 2612 if (IdxBaseReg == AMDGPU::NoRegister) { 2613 // This will happen if the index is a known constant. This should ordinarily 2614 // be legalized out, but handle it as a register just in case. 2615 assert(Offset == 0); 2616 IdxBaseReg = IdxReg; 2617 } 2618 2619 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 2620 2621 // Skip out of bounds offsets, or else we would end up using an undefined 2622 // register. 
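  // Fall back to the original, undecomposed index register and the first
  // subregister in that case.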
2623 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 2624 return std::make_pair(IdxReg, SubRegs[0]); 2625 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 2626 } 2627 2628 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 2629 MachineInstr &MI) const { 2630 Register DstReg = MI.getOperand(0).getReg(); 2631 Register SrcReg = MI.getOperand(1).getReg(); 2632 Register IdxReg = MI.getOperand(2).getReg(); 2633 2634 LLT DstTy = MRI->getType(DstReg); 2635 LLT SrcTy = MRI->getType(SrcReg); 2636 2637 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2638 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2639 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2640 2641 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2642 // into a waterfall loop. 2643 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2644 return false; 2645 2646 const TargetRegisterClass *SrcRC = 2647 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB); 2648 const TargetRegisterClass *DstRC = 2649 TRI.getRegClassForTypeOnBank(DstTy, *DstRB); 2650 if (!SrcRC || !DstRC) 2651 return false; 2652 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2653 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2654 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2655 return false; 2656 2657 MachineBasicBlock *BB = MI.getParent(); 2658 const DebugLoc &DL = MI.getDebugLoc(); 2659 const bool Is64 = DstTy.getSizeInBits() == 64; 2660 2661 unsigned SubReg; 2662 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 2663 DstTy.getSizeInBits() / 8); 2664 2665 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 2666 if (DstTy.getSizeInBits() != 32 && !Is64) 2667 return false; 2668 2669 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2670 .addReg(IdxReg); 2671 2672 unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 2673 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 2674 .addReg(SrcReg, 0, SubReg) 2675 .addReg(SrcReg, RegState::Implicit); 2676 MI.eraseFromParent(); 2677 return true; 2678 } 2679 2680 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 2681 return false; 2682 2683 if (!STI.useVGPRIndexMode()) { 2684 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2685 .addReg(IdxReg); 2686 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 2687 .addReg(SrcReg, 0, SubReg) 2688 .addReg(SrcReg, RegState::Implicit); 2689 MI.eraseFromParent(); 2690 return true; 2691 } 2692 2693 const MCInstrDesc &GPRIDXDesc = 2694 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true); 2695 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 2696 .addReg(SrcReg) 2697 .addReg(IdxReg) 2698 .addImm(SubReg); 2699 2700 MI.eraseFromParent(); 2701 return true; 2702 } 2703 2704 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 2705 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 2706 MachineInstr &MI) const { 2707 Register DstReg = MI.getOperand(0).getReg(); 2708 Register VecReg = MI.getOperand(1).getReg(); 2709 Register ValReg = MI.getOperand(2).getReg(); 2710 Register IdxReg = MI.getOperand(3).getReg(); 2711 2712 LLT VecTy = MRI->getType(DstReg); 2713 LLT ValTy = MRI->getType(ValReg); 2714 unsigned VecSize = VecTy.getSizeInBits(); 2715 unsigned ValSize = ValTy.getSizeInBits(); 2716 2717 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 2718 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 2719 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2720 2721 assert(VecTy.getElementType() == ValTy); 2722 2723 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2724 // into a waterfall loop. 
2725 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2726 return false; 2727 2728 const TargetRegisterClass *VecRC = 2729 TRI.getRegClassForTypeOnBank(VecTy, *VecRB); 2730 const TargetRegisterClass *ValRC = 2731 TRI.getRegClassForTypeOnBank(ValTy, *ValRB); 2732 2733 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 2734 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 2735 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 2736 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2737 return false; 2738 2739 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 2740 return false; 2741 2742 unsigned SubReg; 2743 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 2744 ValSize / 8); 2745 2746 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 2747 STI.useVGPRIndexMode(); 2748 2749 MachineBasicBlock *BB = MI.getParent(); 2750 const DebugLoc &DL = MI.getDebugLoc(); 2751 2752 if (!IndexMode) { 2753 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2754 .addReg(IdxReg); 2755 2756 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo( 2757 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID); 2758 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 2759 .addReg(VecReg) 2760 .addReg(ValReg) 2761 .addImm(SubReg); 2762 MI.eraseFromParent(); 2763 return true; 2764 } 2765 2766 const MCInstrDesc &GPRIDXDesc = 2767 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 2768 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 2769 .addReg(VecReg) 2770 .addReg(ValReg) 2771 .addReg(IdxReg) 2772 .addImm(SubReg); 2773 2774 MI.eraseFromParent(); 2775 return true; 2776 } 2777 2778 static bool isZeroOrUndef(int X) { 2779 return X == 0 || X == -1; 2780 } 2781 2782 static bool isOneOrUndef(int X) { 2783 return X == 1 || X == -1; 2784 } 2785 2786 static bool isZeroOrOneOrUndef(int X) { 2787 return X == 0 || X == 1 || X == -1; 2788 } 2789 2790 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single 2791 // 32-bit register. 2792 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, 2793 ArrayRef<int> Mask) { 2794 NewMask[0] = Mask[0]; 2795 NewMask[1] = Mask[1]; 2796 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) 2797 return Src0; 2798 2799 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); 2800 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); 2801 2802 // Shift the mask inputs to be 0/1; 2803 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; 2804 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; 2805 return Src1; 2806 } 2807 2808 // This is only legal with VOP3P instructions as an aid to op_sel matching. 
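// Only v2s16 shuffles whose mask reads a single source are selected here;
// they are lowered to 16-bit shifts, S_PACK_*, V_ALIGNBIT, or an SDWA mov.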
2809 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( 2810 MachineInstr &MI) const { 2811 Register DstReg = MI.getOperand(0).getReg(); 2812 Register Src0Reg = MI.getOperand(1).getReg(); 2813 Register Src1Reg = MI.getOperand(2).getReg(); 2814 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); 2815 2816 const LLT V2S16 = LLT::fixed_vector(2, 16); 2817 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) 2818 return false; 2819 2820 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) 2821 return false; 2822 2823 assert(ShufMask.size() == 2); 2824 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); 2825 2826 MachineBasicBlock *MBB = MI.getParent(); 2827 const DebugLoc &DL = MI.getDebugLoc(); 2828 2829 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2830 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2831 const TargetRegisterClass &RC = IsVALU ? 2832 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2833 2834 // Handle the degenerate case which should have folded out. 2835 if (ShufMask[0] == -1 && ShufMask[1] == -1) { 2836 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); 2837 2838 MI.eraseFromParent(); 2839 return RBI.constrainGenericRegister(DstReg, RC, *MRI); 2840 } 2841 2842 // A legal VOP3P mask only reads one of the sources. 2843 int Mask[2]; 2844 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); 2845 2846 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || 2847 !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) 2848 return false; 2849 2850 // TODO: This also should have been folded out 2851 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { 2852 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) 2853 .addReg(SrcVec); 2854 2855 MI.eraseFromParent(); 2856 return true; 2857 } 2858 2859 if (Mask[0] == 1 && Mask[1] == -1) { 2860 if (IsVALU) { 2861 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 2862 .addImm(16) 2863 .addReg(SrcVec); 2864 } else { 2865 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 2866 .addReg(SrcVec) 2867 .addImm(16); 2868 } 2869 } else if (Mask[0] == -1 && Mask[1] == 0) { 2870 if (IsVALU) { 2871 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) 2872 .addImm(16) 2873 .addReg(SrcVec); 2874 } else { 2875 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) 2876 .addReg(SrcVec) 2877 .addImm(16); 2878 } 2879 } else if (Mask[0] == 0 && Mask[1] == 0) { 2880 if (IsVALU) { 2881 // Write low half of the register into the high half. 2882 MachineInstr *MovSDWA = 2883 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2884 .addImm(0) // $src0_modifiers 2885 .addReg(SrcVec) // $src0 2886 .addImm(0) // $clamp 2887 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2888 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2889 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2890 .addReg(SrcVec, RegState::Implicit); 2891 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2892 } else { 2893 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2894 .addReg(SrcVec) 2895 .addReg(SrcVec); 2896 } 2897 } else if (Mask[0] == 1 && Mask[1] == 1) { 2898 if (IsVALU) { 2899 // Write high half of the register into the low half. 
2900 MachineInstr *MovSDWA = 2901 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2902 .addImm(0) // $src0_modifiers 2903 .addReg(SrcVec) // $src0 2904 .addImm(0) // $clamp 2905 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel 2906 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2907 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel 2908 .addReg(SrcVec, RegState::Implicit); 2909 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2910 } else { 2911 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) 2912 .addReg(SrcVec) 2913 .addReg(SrcVec); 2914 } 2915 } else if (Mask[0] == 1 && Mask[1] == 0) { 2916 if (IsVALU) { 2917 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg) 2918 .addReg(SrcVec) 2919 .addReg(SrcVec) 2920 .addImm(16); 2921 } else { 2922 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2923 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) 2924 .addReg(SrcVec) 2925 .addImm(16); 2926 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2927 .addReg(TmpReg) 2928 .addReg(SrcVec); 2929 } 2930 } else 2931 llvm_unreachable("all shuffle masks should be handled"); 2932 2933 MI.eraseFromParent(); 2934 return true; 2935 } 2936 2937 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( 2938 MachineInstr &MI) const { 2939 if (STI.hasGFX90AInsts()) 2940 return selectImpl(MI, *CoverageInfo); 2941 2942 MachineBasicBlock *MBB = MI.getParent(); 2943 const DebugLoc &DL = MI.getDebugLoc(); 2944 2945 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { 2946 Function &F = MBB->getParent()->getFunction(); 2947 DiagnosticInfoUnsupported 2948 NoFpRet(F, "return versions of fp atomics not supported", 2949 MI.getDebugLoc(), DS_Error); 2950 F.getContext().diagnose(NoFpRet); 2951 return false; 2952 } 2953 2954 // FIXME: This is only needed because tablegen requires number of dst operands 2955 // in match and replace pattern to be the same. Otherwise patterns can be 2956 // exported from SDag path. 2957 MachineOperand &VDataIn = MI.getOperand(1); 2958 MachineOperand &VIndex = MI.getOperand(3); 2959 MachineOperand &VOffset = MI.getOperand(4); 2960 MachineOperand &SOffset = MI.getOperand(5); 2961 int16_t Offset = MI.getOperand(6).getImm(); 2962 2963 bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI); 2964 bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI); 2965 2966 unsigned Opcode; 2967 if (HasVOffset) { 2968 Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN 2969 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN; 2970 } else { 2971 Opcode = HasVIndex ? 
AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN 2972 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET; 2973 } 2974 2975 if (MRI->getType(VDataIn.getReg()).isVector()) { 2976 switch (Opcode) { 2977 case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN: 2978 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN; 2979 break; 2980 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN: 2981 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN; 2982 break; 2983 case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN: 2984 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN; 2985 break; 2986 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET: 2987 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET; 2988 break; 2989 } 2990 } 2991 2992 auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode)); 2993 I.add(VDataIn); 2994 2995 if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN || 2996 Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) { 2997 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); 2998 BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) 2999 .addReg(VIndex.getReg()) 3000 .addImm(AMDGPU::sub0) 3001 .addReg(VOffset.getReg()) 3002 .addImm(AMDGPU::sub1); 3003 3004 I.addReg(IdxReg); 3005 } else if (HasVIndex) { 3006 I.add(VIndex); 3007 } else if (HasVOffset) { 3008 I.add(VOffset); 3009 } 3010 3011 I.add(MI.getOperand(2)); // rsrc 3012 I.add(SOffset); 3013 I.addImm(Offset); 3014 I.addImm(MI.getOperand(7).getImm()); // cpol 3015 I.cloneMemRefs(MI); 3016 3017 MI.eraseFromParent(); 3018 3019 return true; 3020 } 3021 3022 bool AMDGPUInstructionSelector::selectGlobalAtomicFadd( 3023 MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const { 3024 3025 if (STI.hasGFX90AInsts()) { 3026 // gfx90a adds return versions of the global atomic fadd instructions so no 3027 // special handling is required. 3028 return selectImpl(MI, *CoverageInfo); 3029 } 3030 3031 MachineBasicBlock *MBB = MI.getParent(); 3032 const DebugLoc &DL = MI.getDebugLoc(); 3033 3034 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { 3035 Function &F = MBB->getParent()->getFunction(); 3036 DiagnosticInfoUnsupported 3037 NoFpRet(F, "return versions of fp atomics not supported", 3038 MI.getDebugLoc(), DS_Error); 3039 F.getContext().diagnose(NoFpRet); 3040 return false; 3041 } 3042 3043 // FIXME: This is only needed because tablegen requires number of dst operands 3044 // in match and replace pattern to be the same. Otherwise patterns can be 3045 // exported from SDag path. 3046 auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal); 3047 3048 Register Data = DataOp.getReg(); 3049 const unsigned Opc = MRI->getType(Data).isVector() ? 3050 AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32; 3051 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) 3052 .addReg(Addr.first) 3053 .addReg(Data) 3054 .addImm(Addr.second) 3055 .addImm(0) // cpol 3056 .cloneMemRefs(MI); 3057 3058 MI.eraseFromParent(); 3059 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3060 } 3061 3062 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { 3063 unsigned Opc; 3064 unsigned Size = MI.getOperand(3).getImm(); 3065 3066 // The struct intrinsic variants add one additional operand over raw. 
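  // i.e. nine operands total for the struct form, the extra one being the
  // vindex register.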
3067 const bool HasVIndex = MI.getNumOperands() == 9; 3068 Register VIndex; 3069 int OpOffset = 0; 3070 if (HasVIndex) { 3071 VIndex = MI.getOperand(4).getReg(); 3072 OpOffset = 1; 3073 } 3074 3075 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3076 Optional<ValueAndVReg> MaybeVOffset = 3077 getIConstantVRegValWithLookThrough(VOffset, *MRI); 3078 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue(); 3079 3080 switch (Size) { 3081 default: 3082 return false; 3083 case 1: 3084 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN 3085 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN 3086 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN 3087 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; 3088 break; 3089 case 2: 3090 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN 3091 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN 3092 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN 3093 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; 3094 break; 3095 case 4: 3096 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN 3097 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN 3098 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN 3099 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; 3100 break; 3101 } 3102 3103 MachineBasicBlock *MBB = MI.getParent(); 3104 const DebugLoc &DL = MI.getDebugLoc(); 3105 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3106 .add(MI.getOperand(2)); 3107 3108 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)); 3109 3110 if (HasVIndex && HasVOffset) { 3111 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); 3112 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) 3113 .addReg(VIndex) 3114 .addImm(AMDGPU::sub0) 3115 .addReg(VOffset) 3116 .addImm(AMDGPU::sub1); 3117 3118 MIB.addReg(IdxReg); 3119 } else if (HasVIndex) { 3120 MIB.addReg(VIndex); 3121 } else if (HasVOffset) { 3122 MIB.addReg(VOffset); 3123 } 3124 3125 MIB.add(MI.getOperand(1)); // rsrc 3126 MIB.add(MI.getOperand(5 + OpOffset)); // soffset 3127 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset 3128 unsigned Aux = MI.getOperand(7 + OpOffset).getImm(); 3129 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol 3130 MIB.addImm((Aux >> 3) & 1); // swz 3131 3132 MachineMemOperand *LoadMMO = *MI.memoperands_begin(); 3133 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 3134 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm(); 3135 MachinePointerInfo StorePtrI = LoadPtrI; 3136 StorePtrI.V = nullptr; 3137 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 3138 3139 auto F = LoadMMO->getFlags() & 3140 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 3141 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, 3142 Size, LoadMMO->getBaseAlign()); 3143 3144 MachineMemOperand *StoreMMO = 3145 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, 3146 sizeof(int32_t), LoadMMO->getBaseAlign()); 3147 3148 MIB.setMemRefs({LoadMMO, StoreMMO}); 3149 3150 MI.eraseFromParent(); 3151 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3152 } 3153 3154 /// Match a zero extend from a 32-bit value to 64-bits. 3155 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { 3156 Register ZExtSrc; 3157 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) 3158 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? 
ZExtSrc : Register(); 3159 3160 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) 3161 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); 3162 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) 3163 return false; 3164 3165 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { 3166 return Def->getOperand(1).getReg(); 3167 } 3168 3169 return Register(); 3170 } 3171 3172 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ 3173 unsigned Opc; 3174 unsigned Size = MI.getOperand(3).getImm(); 3175 3176 switch (Size) { 3177 default: 3178 return false; 3179 case 1: 3180 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; 3181 break; 3182 case 2: 3183 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; 3184 break; 3185 case 4: 3186 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; 3187 break; 3188 } 3189 3190 MachineBasicBlock *MBB = MI.getParent(); 3191 const DebugLoc &DL = MI.getDebugLoc(); 3192 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3193 .add(MI.getOperand(2)); 3194 3195 Register Addr = MI.getOperand(1).getReg(); 3196 Register VOffset; 3197 // Try to split SAddr and VOffset. Global and LDS pointers share the same 3198 // immediate offset, so we cannot use a regular SelectGlobalSAddr(). 3199 if (!isSGPR(Addr)) { 3200 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 3201 if (isSGPR(AddrDef->Reg)) { 3202 Addr = AddrDef->Reg; 3203 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 3204 Register SAddr = 3205 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); 3206 if (SAddr && isSGPR(SAddr)) { 3207 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); 3208 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { 3209 Addr = SAddr; 3210 VOffset = Off; 3211 } 3212 } 3213 } 3214 } 3215 3216 if (isSGPR(Addr)) { 3217 Opc = AMDGPU::getGlobalSaddrOp(Opc); 3218 if (!VOffset) { 3219 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3220 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset) 3221 .addImm(0); 3222 } 3223 } 3224 3225 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) 3226 .addReg(Addr); 3227 3228 if (isSGPR(Addr)) 3229 MIB.addReg(VOffset); 3230 3231 MIB.add(MI.getOperand(4)) // offset 3232 .add(MI.getOperand(5)); // cpol 3233 3234 MachineMemOperand *LoadMMO = *MI.memoperands_begin(); 3235 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 3236 LoadPtrI.Offset = MI.getOperand(4).getImm(); 3237 MachinePointerInfo StorePtrI = LoadPtrI; 3238 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; 3239 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 3240 auto F = LoadMMO->getFlags() & 3241 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 3242 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, 3243 Size, LoadMMO->getBaseAlign()); 3244 MachineMemOperand *StoreMMO = 3245 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, 3246 sizeof(int32_t), Align(4)); 3247 3248 MIB.setMemRefs({LoadMMO, StoreMMO}); 3249 3250 MI.eraseFromParent(); 3251 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3252 } 3253 3254 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ 3255 MI.setDesc(TII.get(MI.getOperand(1).getImm())); 3256 MI.removeOperand(1); 3257 MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); 3258 return true; 3259 } 3260 3261 bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { 3262 unsigned Opc; 3263 switch (MI.getIntrinsicID()) { 3264 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: 3265 Opc = 
AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64; 3266 break; 3267 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: 3268 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64; 3269 break; 3270 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: 3271 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64; 3272 break; 3273 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: 3274 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64; 3275 break; 3276 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: 3277 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64; 3278 break; 3279 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: 3280 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64; 3281 break; 3282 default: 3283 llvm_unreachable("unhandled smfmac intrinsic"); 3284 } 3285 3286 auto VDst_In = MI.getOperand(4); 3287 3288 MI.setDesc(TII.get(Opc)); 3289 MI.removeOperand(4); // VDst_In 3290 MI.removeOperand(1); // Intrinsic ID 3291 MI.addOperand(VDst_In); // Readd VDst_In to the end 3292 MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); 3293 return true; 3294 } 3295 3296 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { 3297 Register DstReg = MI.getOperand(0).getReg(); 3298 Register SrcReg = MI.getOperand(1).getReg(); 3299 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3300 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 3301 MachineBasicBlock *MBB = MI.getParent(); 3302 const DebugLoc &DL = MI.getDebugLoc(); 3303 3304 if (IsVALU) { 3305 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 3306 .addImm(Subtarget->getWavefrontSizeLog2()) 3307 .addReg(SrcReg); 3308 } else { 3309 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 3310 .addReg(SrcReg) 3311 .addImm(Subtarget->getWavefrontSizeLog2()); 3312 } 3313 3314 const TargetRegisterClass &RC = 3315 IsVALU ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 3316 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 3317 return false; 3318 3319 MI.eraseFromParent(); 3320 return true; 3321 } 3322 3323 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 3324 if (I.isPHI()) 3325 return selectPHI(I); 3326 3327 if (!I.isPreISelOpcode()) { 3328 if (I.isCopy()) 3329 return selectCOPY(I); 3330 return true; 3331 } 3332 3333 switch (I.getOpcode()) { 3334 case TargetOpcode::G_AND: 3335 case TargetOpcode::G_OR: 3336 case TargetOpcode::G_XOR: 3337 if (selectImpl(I, *CoverageInfo)) 3338 return true; 3339 return selectG_AND_OR_XOR(I); 3340 case TargetOpcode::G_ADD: 3341 case TargetOpcode::G_SUB: 3342 if (selectImpl(I, *CoverageInfo)) 3343 return true; 3344 return selectG_ADD_SUB(I); 3345 case TargetOpcode::G_UADDO: 3346 case TargetOpcode::G_USUBO: 3347 case TargetOpcode::G_UADDE: 3348 case TargetOpcode::G_USUBE: 3349 return selectG_UADDO_USUBO_UADDE_USUBE(I); 3350 case TargetOpcode::G_INTTOPTR: 3351 case TargetOpcode::G_BITCAST: 3352 case TargetOpcode::G_PTRTOINT: 3353 return selectCOPY(I); 3354 case TargetOpcode::G_CONSTANT: 3355 case TargetOpcode::G_FCONSTANT: 3356 return selectG_CONSTANT(I); 3357 case TargetOpcode::G_FNEG: 3358 if (selectImpl(I, *CoverageInfo)) 3359 return true; 3360 return selectG_FNEG(I); 3361 case TargetOpcode::G_FABS: 3362 if (selectImpl(I, *CoverageInfo)) 3363 return true; 3364 return selectG_FABS(I); 3365 case TargetOpcode::G_EXTRACT: 3366 return selectG_EXTRACT(I); 3367 case TargetOpcode::G_MERGE_VALUES: 3368 case TargetOpcode::G_BUILD_VECTOR: 3369 case TargetOpcode::G_CONCAT_VECTORS: 3370 return selectG_MERGE_VALUES(I); 3371 case TargetOpcode::G_UNMERGE_VALUES: 3372 return selectG_UNMERGE_VALUES(I); 3373 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 3374 return selectG_BUILD_VECTOR_TRUNC(I); 3375 case TargetOpcode::G_PTR_ADD: 3376 return selectG_PTR_ADD(I); 3377 case TargetOpcode::G_IMPLICIT_DEF: 3378 return selectG_IMPLICIT_DEF(I); 3379 case TargetOpcode::G_FREEZE: 3380 return selectCOPY(I); 3381 case TargetOpcode::G_INSERT: 3382 return selectG_INSERT(I); 3383 case TargetOpcode::G_INTRINSIC: 3384 return selectG_INTRINSIC(I); 3385 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 3386 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 3387 case TargetOpcode::G_ICMP: 3388 if (selectG_ICMP(I)) 3389 return true; 3390 return selectImpl(I, *CoverageInfo); 3391 case TargetOpcode::G_LOAD: 3392 case TargetOpcode::G_STORE: 3393 case TargetOpcode::G_ATOMIC_CMPXCHG: 3394 case TargetOpcode::G_ATOMICRMW_XCHG: 3395 case TargetOpcode::G_ATOMICRMW_ADD: 3396 case TargetOpcode::G_ATOMICRMW_SUB: 3397 case TargetOpcode::G_ATOMICRMW_AND: 3398 case TargetOpcode::G_ATOMICRMW_OR: 3399 case TargetOpcode::G_ATOMICRMW_XOR: 3400 case TargetOpcode::G_ATOMICRMW_MIN: 3401 case TargetOpcode::G_ATOMICRMW_MAX: 3402 case TargetOpcode::G_ATOMICRMW_UMIN: 3403 case TargetOpcode::G_ATOMICRMW_UMAX: 3404 case TargetOpcode::G_ATOMICRMW_FADD: 3405 case AMDGPU::G_AMDGPU_ATOMIC_INC: 3406 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 3407 case AMDGPU::G_AMDGPU_ATOMIC_FMIN: 3408 case AMDGPU::G_AMDGPU_ATOMIC_FMAX: 3409 return selectG_LOAD_STORE_ATOMICRMW(I); 3410 case TargetOpcode::G_SELECT: 3411 return selectG_SELECT(I); 3412 case TargetOpcode::G_TRUNC: 3413 return selectG_TRUNC(I); 3414 case TargetOpcode::G_SEXT: 3415 case TargetOpcode::G_ZEXT: 3416 case TargetOpcode::G_ANYEXT: 3417 case TargetOpcode::G_SEXT_INREG: 3418 if (selectImpl(I, *CoverageInfo)) 3419 return true; 3420 return selectG_SZA_EXT(I); 3421 case TargetOpcode::G_BRCOND: 3422 return 
selectG_BRCOND(I); 3423 case TargetOpcode::G_GLOBAL_VALUE: 3424 return selectG_GLOBAL_VALUE(I); 3425 case TargetOpcode::G_PTRMASK: 3426 return selectG_PTRMASK(I); 3427 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3428 return selectG_EXTRACT_VECTOR_ELT(I); 3429 case TargetOpcode::G_INSERT_VECTOR_ELT: 3430 return selectG_INSERT_VECTOR_ELT(I); 3431 case TargetOpcode::G_SHUFFLE_VECTOR: 3432 return selectG_SHUFFLE_VECTOR(I); 3433 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 3434 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 3435 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 3436 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 3437 const AMDGPU::ImageDimIntrinsicInfo *Intr 3438 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); 3439 assert(Intr && "not an image intrinsic with image pseudo"); 3440 return selectImageIntrinsic(I, Intr); 3441 } 3442 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: 3443 return selectBVHIntrinsic(I); 3444 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 3445 return selectAMDGPU_BUFFER_ATOMIC_FADD(I); 3446 case AMDGPU::G_SBFX: 3447 case AMDGPU::G_UBFX: 3448 return selectG_SBFX_UBFX(I); 3449 case AMDGPU::G_SI_CALL: 3450 I.setDesc(TII.get(AMDGPU::SI_CALL)); 3451 return true; 3452 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: 3453 return selectWaveAddress(I); 3454 default: 3455 return selectImpl(I, *CoverageInfo); 3456 } 3457 return false; 3458 } 3459 3460 InstructionSelector::ComplexRendererFns 3461 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 3462 return {{ 3463 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 3464 }}; 3465 3466 } 3467 3468 std::pair<Register, unsigned> 3469 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, 3470 bool AllowAbs) const { 3471 Register Src = Root.getReg(); 3472 Register OrigSrc = Src; 3473 unsigned Mods = 0; 3474 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 3475 3476 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 3477 Src = MI->getOperand(1).getReg(); 3478 Mods |= SISrcMods::NEG; 3479 MI = getDefIgnoringCopies(Src, *MRI); 3480 } 3481 3482 if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) { 3483 Src = MI->getOperand(1).getReg(); 3484 Mods |= SISrcMods::ABS; 3485 } 3486 3487 if (Mods != 0 && 3488 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 3489 MachineInstr *UseMI = Root.getParent(); 3490 3491 // If we looked through copies to find source modifiers on an SGPR operand, 3492 // we now have an SGPR register source. To avoid potentially violating the 3493 // constant bus restriction, we need to insert a copy to a VGPR. 3494 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); 3495 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 3496 TII.get(AMDGPU::COPY), VGPRSrc) 3497 .addReg(Src); 3498 Src = VGPRSrc; 3499 } 3500 3501 return std::make_pair(Src, Mods); 3502 } 3503 3504 /// 3505 /// This will select either an SGPR or VGPR operand and will save us from 3506 /// having to write an extra tablegen pattern. 
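/// The operand is rendered unchanged here; the register class constraints on
/// the selected instruction decide whether it ends up in an SGPR or a VGPR.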
3507 InstructionSelector::ComplexRendererFns 3508 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 3509 return {{ 3510 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 3511 }}; 3512 } 3513 3514 InstructionSelector::ComplexRendererFns 3515 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 3516 Register Src; 3517 unsigned Mods; 3518 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3519 3520 return {{ 3521 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3522 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 3523 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3524 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3525 }}; 3526 } 3527 3528 InstructionSelector::ComplexRendererFns 3529 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { 3530 Register Src; 3531 unsigned Mods; 3532 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); 3533 3534 return {{ 3535 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3536 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 3537 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3538 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3539 }}; 3540 } 3541 3542 InstructionSelector::ComplexRendererFns 3543 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 3544 return {{ 3545 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 3546 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3547 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3548 }}; 3549 } 3550 3551 InstructionSelector::ComplexRendererFns 3552 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 3553 Register Src; 3554 unsigned Mods; 3555 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3556 3557 return {{ 3558 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3559 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3560 }}; 3561 } 3562 3563 InstructionSelector::ComplexRendererFns 3564 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { 3565 Register Src; 3566 unsigned Mods; 3567 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); 3568 3569 return {{ 3570 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3571 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3572 }}; 3573 } 3574 3575 InstructionSelector::ComplexRendererFns 3576 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 3577 Register Reg = Root.getReg(); 3578 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 3579 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 3580 Def->getOpcode() == AMDGPU::G_FABS)) 3581 return {}; 3582 return {{ 3583 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3584 }}; 3585 } 3586 3587 std::pair<Register, unsigned> 3588 AMDGPUInstructionSelector::selectVOP3PModsImpl( 3589 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const { 3590 unsigned Mods = 0; 3591 MachineInstr *MI = MRI.getVRegDef(Src); 3592 3593 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 3594 // It's possible to see an f32 fneg here, but unlikely. 3595 // TODO: Treat f32 fneg as only high bit. 3596 MRI.getType(Src) == LLT::fixed_vector(2, 16)) { 3597 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 3598 Src = MI->getOperand(1).getReg(); 3599 MI = MRI.getVRegDef(Src); 3600 } 3601 3602 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 
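  // Illustrative note (not in the original source): a swizzle such as
  //   %s:_(<2 x s16>) = G_SHUFFLE_VECTOR %a(<2 x s16>), %b, shufflemask(1, 0)
  // could in principle be folded into the op_sel/op_sel_hi source modifier
  // bits instead of surviving as a separate instruction, but that matching is
  // not attempted here.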
3603 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() 3604 3605 // Packed instructions do not have abs modifiers. 3606 Mods |= SISrcMods::OP_SEL_1; 3607 3608 return std::make_pair(Src, Mods); 3609 } 3610 3611 InstructionSelector::ComplexRendererFns 3612 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 3613 MachineRegisterInfo &MRI 3614 = Root.getParent()->getParent()->getParent()->getRegInfo(); 3615 3616 Register Src; 3617 unsigned Mods; 3618 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 3619 3620 return {{ 3621 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3622 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3623 }}; 3624 } 3625 3626 InstructionSelector::ComplexRendererFns 3627 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { 3628 MachineRegisterInfo &MRI 3629 = Root.getParent()->getParent()->getParent()->getRegInfo(); 3630 3631 Register Src; 3632 unsigned Mods; 3633 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true); 3634 3635 return {{ 3636 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3637 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3638 }}; 3639 } 3640 3641 InstructionSelector::ComplexRendererFns 3642 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 3643 Register Src; 3644 unsigned Mods; 3645 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3646 if (!isKnownNeverNaN(Src, *MRI)) 3647 return None; 3648 3649 return {{ 3650 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3651 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3652 }}; 3653 } 3654 3655 InstructionSelector::ComplexRendererFns 3656 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 3657 // FIXME: Handle op_sel 3658 return {{ 3659 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3660 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 3661 }}; 3662 } 3663 3664 InstructionSelector::ComplexRendererFns 3665 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 3666 SmallVector<GEPInfo, 4> AddrInfo; 3667 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3668 3669 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3670 return None; 3671 3672 const GEPInfo &GEPInfo = AddrInfo[0]; 3673 Optional<int64_t> EncodedImm = 3674 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false); 3675 if (!EncodedImm) 3676 return None; 3677 3678 unsigned PtrReg = GEPInfo.SgprParts[0]; 3679 return {{ 3680 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3681 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3682 }}; 3683 } 3684 3685 InstructionSelector::ComplexRendererFns 3686 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 3687 SmallVector<GEPInfo, 4> AddrInfo; 3688 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3689 3690 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3691 return None; 3692 3693 const GEPInfo &GEPInfo = AddrInfo[0]; 3694 Register PtrReg = GEPInfo.SgprParts[0]; 3695 Optional<int64_t> EncodedImm = 3696 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 3697 if (!EncodedImm) 3698 return None; 3699 3700 return {{ 3701 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3702 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3703 }}; 3704 } 3705 3706 InstructionSelector::ComplexRendererFns 3707 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 3708 MachineInstr *MI = 
      Root.getParent();
3709   MachineBasicBlock *MBB = MI->getParent();
3710
3711   SmallVector<GEPInfo, 4> AddrInfo;
3712   getAddrModeInfo(*MI, *MRI, AddrInfo);
3713
3714   // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3715   // then we can select all ptr + 32-bit offsets, not just immediate offsets.
3716   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3717     return None;
3718
3719   const GEPInfo &GEPInfo = AddrInfo[0];
3720   // SGPR offset is unsigned.
3721   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3722     return None;
3723
3724   // If we make it this far we have a load with a 32-bit immediate offset.
3725   // It is OK to select this using an SGPR offset, because we have already
3726   // failed trying to select this load into one of the _IMM variants since
3727   // the _IMM patterns are considered before the _SGPR patterns.
3728   Register PtrReg = GEPInfo.SgprParts[0];
3729   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3730   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3731       .addImm(GEPInfo.Imm);
3732   return {{
3733     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3734     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3735   }};
3736 }
3737
3738 std::pair<Register, int>
3739 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
3740                                                 uint64_t FlatVariant) const {
3741   MachineInstr *MI = Root.getParent();
3742
3743   auto Default = std::make_pair(Root.getReg(), 0);
3744
3745   if (!STI.hasFlatInstOffsets())
3746     return Default;
3747
3748   Register PtrBase;
3749   int64_t ConstOffset;
3750   std::tie(PtrBase, ConstOffset) =
3751       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3752   if (ConstOffset == 0)
3753     return Default;
3754
3755   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3756   if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
3757     return Default;
3758
3759   return std::make_pair(PtrBase, ConstOffset);
3760 }
3761
3762 InstructionSelector::ComplexRendererFns
3763 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3764   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
3765
3766   return {{
3767       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3768       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3769     }};
3770 }
3771
3772 InstructionSelector::ComplexRendererFns
3773 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
3774   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
3775
3776   return {{
3777       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3778       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3779     }};
3780 }
3781
3782 InstructionSelector::ComplexRendererFns
3783 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
3784   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
3785
3786   return {{
3787       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3788       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3789     }};
3790 }
3791
3792 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3793 InstructionSelector::ComplexRendererFns
3794 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3795   Register Addr = Root.getReg();
3796   Register PtrBase;
3797   int64_t ConstOffset;
3798   int64_t ImmOffset = 0;
3799
3800   // Match the immediate offset first, which canonically is
moved as low as 3801 // possible. 3802 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 3803 3804 if (ConstOffset != 0) { 3805 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, 3806 SIInstrFlags::FlatGlobal)) { 3807 Addr = PtrBase; 3808 ImmOffset = ConstOffset; 3809 } else { 3810 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI); 3811 if (isSGPR(PtrBaseDef->Reg)) { 3812 if (ConstOffset > 0) { 3813 // Offset is too large. 3814 // 3815 // saddr + large_offset -> saddr + 3816 // (voffset = large_offset & ~MaxOffset) + 3817 // (large_offset & MaxOffset); 3818 int64_t SplitImmOffset, RemainderOffset; 3819 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset( 3820 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); 3821 3822 if (isUInt<32>(RemainderOffset)) { 3823 MachineInstr *MI = Root.getParent(); 3824 MachineBasicBlock *MBB = MI->getParent(); 3825 Register HighBits = 3826 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3827 3828 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 3829 HighBits) 3830 .addImm(RemainderOffset); 3831 3832 return {{ 3833 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr 3834 [=](MachineInstrBuilder &MIB) { 3835 MIB.addReg(HighBits); 3836 }, // voffset 3837 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, 3838 }}; 3839 } 3840 } 3841 3842 // We are adding a 64 bit SGPR and a constant. If constant bus limit 3843 // is 1 we would need to perform 1 or 2 extra moves for each half of 3844 // the constant and it is better to do a scalar add and then issue a 3845 // single VALU instruction to materialize zero. Otherwise it is less 3846 // instructions to perform VALU adds with immediates or inline literals. 3847 unsigned NumLiterals = 3848 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) + 3849 !TII.isInlineConstant(APInt(32, ConstOffset >> 32)); 3850 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals) 3851 return None; 3852 } 3853 } 3854 } 3855 3856 // Match the variable offset. 3857 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 3858 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 3859 // Look through the SGPR->VGPR copy. 3860 Register SAddr = 3861 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); 3862 3863 if (SAddr && isSGPR(SAddr)) { 3864 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); 3865 3866 // It's possible voffset is an SGPR here, but the copy to VGPR will be 3867 // inserted later. 3868 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { 3869 return {{[=](MachineInstrBuilder &MIB) { // saddr 3870 MIB.addReg(SAddr); 3871 }, 3872 [=](MachineInstrBuilder &MIB) { // voffset 3873 MIB.addReg(VOffset); 3874 }, 3875 [=](MachineInstrBuilder &MIB) { // offset 3876 MIB.addImm(ImmOffset); 3877 }}}; 3878 } 3879 } 3880 } 3881 3882 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and 3883 // drop this. 3884 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF || 3885 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg)) 3886 return None; 3887 3888 // It's cheaper to materialize a single 32-bit zero for vaddr than the two 3889 // moves required to copy a 64-bit SGPR to VGPR. 
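  // Illustrative sketch of the selected form (hypothetical registers, not
  // from the original source):
  //   v_mov_b32 vOFF, 0
  //   global_load_dword vDST, vOFF, s[BASE:BASE+1] offset:IMM
  // i.e. a single VALU move, rather than the two moves needed to copy the
  // 64-bit SGPR pair into a VGPR pair for the plain vaddr form.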
3890 MachineInstr *MI = Root.getParent(); 3891 MachineBasicBlock *MBB = MI->getParent(); 3892 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3893 3894 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset) 3895 .addImm(0); 3896 3897 return {{ 3898 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr 3899 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset 3900 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 3901 }}; 3902 } 3903 3904 InstructionSelector::ComplexRendererFns 3905 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { 3906 Register Addr = Root.getReg(); 3907 Register PtrBase; 3908 int64_t ConstOffset; 3909 int64_t ImmOffset = 0; 3910 3911 // Match the immediate offset first, which canonically is moved as low as 3912 // possible. 3913 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 3914 3915 if (ConstOffset != 0 && 3916 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, 3917 SIInstrFlags::FlatScratch)) { 3918 Addr = PtrBase; 3919 ImmOffset = ConstOffset; 3920 } 3921 3922 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 3923 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { 3924 int FI = AddrDef->MI->getOperand(1).getIndex(); 3925 return {{ 3926 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr 3927 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 3928 }}; 3929 } 3930 3931 Register SAddr = AddrDef->Reg; 3932 3933 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 3934 Register LHS = AddrDef->MI->getOperand(1).getReg(); 3935 Register RHS = AddrDef->MI->getOperand(2).getReg(); 3936 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); 3937 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI); 3938 3939 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX && 3940 isSGPR(RHSDef->Reg)) { 3941 int FI = LHSDef->MI->getOperand(1).getIndex(); 3942 MachineInstr &I = *Root.getParent(); 3943 MachineBasicBlock *BB = I.getParent(); 3944 const DebugLoc &DL = I.getDebugLoc(); 3945 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3946 3947 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr) 3948 .addFrameIndex(FI) 3949 .addReg(RHSDef->Reg); 3950 } 3951 } 3952 3953 if (!isSGPR(SAddr)) 3954 return None; 3955 3956 return {{ 3957 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr 3958 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 3959 }}; 3960 } 3961 3962 InstructionSelector::ComplexRendererFns 3963 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { 3964 Register Addr = Root.getReg(); 3965 Register PtrBase; 3966 int64_t ConstOffset; 3967 int64_t ImmOffset = 0; 3968 3969 // Match the immediate offset first, which canonically is moved as low as 3970 // possible. 
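  // Illustrative MIR shape handled below (hypothetical values):
  //   %c:_(s32)   = G_CONSTANT i32 16
  //   %a0:_(p5)   = G_PTR_ADD %base, %voff   ; %voff lives in a VGPR
  //   %addr:_(p5) = G_PTR_ADD %a0, %c
  // which can select as vaddr = %voff, saddr = %base, offset = 16.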
3971 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 3972 3973 if (ConstOffset != 0 && 3974 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) { 3975 Addr = PtrBase; 3976 ImmOffset = ConstOffset; 3977 } 3978 3979 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 3980 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) 3981 return None; 3982 3983 Register RHS = AddrDef->MI->getOperand(2).getReg(); 3984 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) 3985 return None; 3986 3987 Register LHS = AddrDef->MI->getOperand(1).getReg(); 3988 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); 3989 3990 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { 3991 int FI = LHSDef->MI->getOperand(1).getIndex(); 3992 return {{ 3993 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr 3994 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr 3995 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 3996 }}; 3997 } 3998 3999 if (!isSGPR(LHS)) 4000 return None; 4001 4002 return {{ 4003 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr 4004 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr 4005 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 4006 }}; 4007 } 4008 4009 InstructionSelector::ComplexRendererFns 4010 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 4011 MachineInstr *MI = Root.getParent(); 4012 MachineBasicBlock *MBB = MI->getParent(); 4013 MachineFunction *MF = MBB->getParent(); 4014 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 4015 4016 int64_t Offset = 0; 4017 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) && 4018 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) { 4019 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4020 4021 // TODO: Should this be inside the render function? The iterator seems to 4022 // move. 4023 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 4024 HighBits) 4025 .addImm(Offset & ~4095); 4026 4027 return {{[=](MachineInstrBuilder &MIB) { // rsrc 4028 MIB.addReg(Info->getScratchRSrcReg()); 4029 }, 4030 [=](MachineInstrBuilder &MIB) { // vaddr 4031 MIB.addReg(HighBits); 4032 }, 4033 [=](MachineInstrBuilder &MIB) { // soffset 4034 // Use constant zero for soffset and rely on eliminateFrameIndex 4035 // to choose the appropriate frame register if need be. 4036 MIB.addImm(0); 4037 }, 4038 [=](MachineInstrBuilder &MIB) { // offset 4039 MIB.addImm(Offset & 4095); 4040 }}}; 4041 } 4042 4043 assert(Offset == 0 || Offset == -1); 4044 4045 // Try to fold a frame index directly into the MUBUF vaddr field, and any 4046 // offsets. 
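  // Illustrative example (hypothetical MIR): for
  //   %fi:_(p5)   = G_FRAME_INDEX %stack.0
  //   %c:_(s32)   = G_CONSTANT i32 20
  //   %addr:_(p5) = G_PTR_ADD %fi, %c
  // the frame index becomes the vaddr operand and 20 the immediate offset,
  // provided 20 is a legal MUBUF immediate (plus a sign check on subtargets
  // that range-check private memory).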
4047   Optional<int> FI;
4048   Register VAddr = Root.getReg();
4049   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4050     Register PtrBase;
4051     int64_t ConstOffset;
4052     std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4053     if (ConstOffset != 0) {
4054       if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
4055           (!STI.privateMemoryResourceIsRangeChecked() ||
4056            KnownBits->signBitIsZero(PtrBase))) {
4057         const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4058         if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4059           FI = PtrBaseDef->getOperand(1).getIndex();
4060         else
4061           VAddr = PtrBase;
4062         Offset = ConstOffset;
4063       }
4064     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4065       FI = RootDef->getOperand(1).getIndex();
4066     }
4067   }
4068
4069   return {{[=](MachineInstrBuilder &MIB) { // rsrc
4070              MIB.addReg(Info->getScratchRSrcReg());
4071            },
4072            [=](MachineInstrBuilder &MIB) { // vaddr
4073              if (FI.hasValue())
4074                MIB.addFrameIndex(FI.getValue());
4075              else
4076                MIB.addReg(VAddr);
4077            },
4078            [=](MachineInstrBuilder &MIB) { // soffset
4079              // Use constant zero for soffset and rely on eliminateFrameIndex
4080              // to choose the appropriate frame register if need be.
4081              MIB.addImm(0);
4082            },
4083            [=](MachineInstrBuilder &MIB) { // offset
4084              MIB.addImm(Offset);
4085            }}};
4086 }
4087
4088 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4089                                                 int64_t Offset) const {
4090   if (!isUInt<16>(Offset))
4091     return false;
4092
4093   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4094     return true;
4095
4096   // On Southern Islands, instructions with a negative base value and an offset
4097   // don't seem to work.
4098   return KnownBits->signBitIsZero(Base);
4099 }
4100
4101 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4102                                                  int64_t Offset1,
4103                                                  unsigned Size) const {
4104   if (Offset0 % Size != 0 || Offset1 % Size != 0)
4105     return false;
4106   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4107     return false;
4108
4109   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4110     return true;
4111
4112   // On Southern Islands, instructions with a negative base value and an offset
4113   // don't seem to work.
4114   return KnownBits->signBitIsZero(Base);
4115 }
4116
4117 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4118                                                     unsigned ShAmtBits) const {
4119   assert(MI.getOpcode() == TargetOpcode::G_AND);
4120
4121   Optional<APInt> RHS = getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4122   if (!RHS)
4123     return false;
4124
4125   if (RHS->countTrailingOnes() >= ShAmtBits)
4126     return true;
4127
4128   const APInt &LHSKnownZeros =
4129       KnownBits->getKnownZeroes(MI.getOperand(1).getReg());
4130   return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
4131 }
4132
4133 // Return the wave-level SGPR base address if this is a wave address.
4134 static Register getWaveAddress(const MachineInstr *Def) {
4135   return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
4136              ?
Def->getOperand(1).getReg() 4137 : Register(); 4138 } 4139 4140 InstructionSelector::ComplexRendererFns 4141 AMDGPUInstructionSelector::selectMUBUFScratchOffset( 4142 MachineOperand &Root) const { 4143 Register Reg = Root.getReg(); 4144 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 4145 4146 const MachineInstr *Def = MRI->getVRegDef(Reg); 4147 if (Register WaveBase = getWaveAddress(Def)) { 4148 return {{ 4149 [=](MachineInstrBuilder &MIB) { // rsrc 4150 MIB.addReg(Info->getScratchRSrcReg()); 4151 }, 4152 [=](MachineInstrBuilder &MIB) { // soffset 4153 MIB.addReg(WaveBase); 4154 }, 4155 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset 4156 }}; 4157 } 4158 4159 int64_t Offset = 0; 4160 4161 // FIXME: Copy check is a hack 4162 Register BasePtr; 4163 if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) { 4164 if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset)) 4165 return {}; 4166 const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr); 4167 Register WaveBase = getWaveAddress(BasePtrDef); 4168 if (!WaveBase) 4169 return {}; 4170 4171 return {{ 4172 [=](MachineInstrBuilder &MIB) { // rsrc 4173 MIB.addReg(Info->getScratchRSrcReg()); 4174 }, 4175 [=](MachineInstrBuilder &MIB) { // soffset 4176 MIB.addReg(WaveBase); 4177 }, 4178 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 4179 }}; 4180 } 4181 4182 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || 4183 !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) 4184 return {}; 4185 4186 return {{ 4187 [=](MachineInstrBuilder &MIB) { // rsrc 4188 MIB.addReg(Info->getScratchRSrcReg()); 4189 }, 4190 [=](MachineInstrBuilder &MIB) { // soffset 4191 MIB.addImm(0); 4192 }, 4193 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 4194 }}; 4195 } 4196 4197 std::pair<Register, unsigned> 4198 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 4199 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 4200 if (!RootDef) 4201 return std::make_pair(Root.getReg(), 0); 4202 4203 int64_t ConstAddr = 0; 4204 4205 Register PtrBase; 4206 int64_t Offset; 4207 std::tie(PtrBase, Offset) = 4208 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 4209 4210 if (Offset) { 4211 if (isDSOffsetLegal(PtrBase, Offset)) { 4212 // (add n0, c0) 4213 return std::make_pair(PtrBase, Offset); 4214 } 4215 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 4216 // TODO 4217 4218 4219 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 4220 // TODO 4221 4222 } 4223 4224 return std::make_pair(Root.getReg(), 0); 4225 } 4226 4227 InstructionSelector::ComplexRendererFns 4228 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { 4229 Register Reg; 4230 unsigned Offset; 4231 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); 4232 return {{ 4233 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 4234 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } 4235 }}; 4236 } 4237 4238 InstructionSelector::ComplexRendererFns 4239 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { 4240 return selectDSReadWrite2(Root, 4); 4241 } 4242 4243 InstructionSelector::ComplexRendererFns 4244 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const { 4245 return selectDSReadWrite2(Root, 8); 4246 } 4247 4248 InstructionSelector::ComplexRendererFns 4249 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root, 4250 unsigned Size) const { 4251 Register Reg; 4252 unsigned Offset; 4253 
std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size); 4254 return {{ 4255 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 4256 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, 4257 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); } 4258 }}; 4259 } 4260 4261 std::pair<Register, unsigned> 4262 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, 4263 unsigned Size) const { 4264 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 4265 if (!RootDef) 4266 return std::make_pair(Root.getReg(), 0); 4267 4268 int64_t ConstAddr = 0; 4269 4270 Register PtrBase; 4271 int64_t Offset; 4272 std::tie(PtrBase, Offset) = 4273 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 4274 4275 if (Offset) { 4276 int64_t OffsetValue0 = Offset; 4277 int64_t OffsetValue1 = Offset + Size; 4278 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) { 4279 // (add n0, c0) 4280 return std::make_pair(PtrBase, OffsetValue0 / Size); 4281 } 4282 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 4283 // TODO 4284 4285 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 4286 // TODO 4287 4288 } 4289 4290 return std::make_pair(Root.getReg(), 0); 4291 } 4292 4293 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return 4294 /// the base value with the constant offset. There may be intervening copies 4295 /// between \p Root and the identified constant. Returns \p Root, 0 if this does 4296 /// not match the pattern. 4297 std::pair<Register, int64_t> 4298 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( 4299 Register Root, const MachineRegisterInfo &MRI) const { 4300 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI); 4301 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) 4302 return {Root, 0}; 4303 4304 MachineOperand &RHS = RootI->getOperand(2); 4305 Optional<ValueAndVReg> MaybeOffset = 4306 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); 4307 if (!MaybeOffset) 4308 return {Root, 0}; 4309 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()}; 4310 } 4311 4312 static void addZeroImm(MachineInstrBuilder &MIB) { 4313 MIB.addImm(0); 4314 } 4315 4316 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p 4317 /// BasePtr is not valid, a null base pointer will be used. 4318 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, 4319 uint32_t FormatLo, uint32_t FormatHi, 4320 Register BasePtr) { 4321 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 4322 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 4323 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4324 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 4325 4326 B.buildInstr(AMDGPU::S_MOV_B32) 4327 .addDef(RSrc2) 4328 .addImm(FormatLo); 4329 B.buildInstr(AMDGPU::S_MOV_B32) 4330 .addDef(RSrc3) 4331 .addImm(FormatHi); 4332 4333 // Build the half of the subregister with the constants before building the 4334 // full 128-bit register. If we are building multiple resource descriptors, 4335 // this will allow CSEing of the 2-component register. 
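  // e.g. two resource descriptors built with the same format words but
  // different base pointers can share this two-word REG_SEQUENCE after CSE.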
4336 B.buildInstr(AMDGPU::REG_SEQUENCE) 4337 .addDef(RSrcHi) 4338 .addReg(RSrc2) 4339 .addImm(AMDGPU::sub0) 4340 .addReg(RSrc3) 4341 .addImm(AMDGPU::sub1); 4342 4343 Register RSrcLo = BasePtr; 4344 if (!BasePtr) { 4345 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4346 B.buildInstr(AMDGPU::S_MOV_B64) 4347 .addDef(RSrcLo) 4348 .addImm(0); 4349 } 4350 4351 B.buildInstr(AMDGPU::REG_SEQUENCE) 4352 .addDef(RSrc) 4353 .addReg(RSrcLo) 4354 .addImm(AMDGPU::sub0_sub1) 4355 .addReg(RSrcHi) 4356 .addImm(AMDGPU::sub2_sub3); 4357 4358 return RSrc; 4359 } 4360 4361 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 4362 const SIInstrInfo &TII, Register BasePtr) { 4363 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 4364 4365 // FIXME: Why are half the "default" bits ignored based on the addressing 4366 // mode? 4367 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr); 4368 } 4369 4370 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 4371 const SIInstrInfo &TII, Register BasePtr) { 4372 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 4373 4374 // FIXME: Why are half the "default" bits ignored based on the addressing 4375 // mode? 4376 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr); 4377 } 4378 4379 AMDGPUInstructionSelector::MUBUFAddressData 4380 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { 4381 MUBUFAddressData Data; 4382 Data.N0 = Src; 4383 4384 Register PtrBase; 4385 int64_t Offset; 4386 4387 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); 4388 if (isUInt<32>(Offset)) { 4389 Data.N0 = PtrBase; 4390 Data.Offset = Offset; 4391 } 4392 4393 if (MachineInstr *InputAdd 4394 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) { 4395 Data.N2 = InputAdd->getOperand(1).getReg(); 4396 Data.N3 = InputAdd->getOperand(2).getReg(); 4397 4398 // FIXME: Need to fix extra SGPR->VGPRcopies inserted 4399 // FIXME: Don't know this was defined by operand 0 4400 // 4401 // TODO: Remove this when we have copy folding optimizations after 4402 // RegBankSelect. 4403 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg(); 4404 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg(); 4405 } 4406 4407 return Data; 4408 } 4409 4410 /// Return if the addr64 mubuf mode should be used for the given address. 4411 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const { 4412 // (ptr_add N2, N3) -> addr64, or 4413 // (ptr_add (ptr_add N2, N3), C1) -> addr64 4414 if (Addr.N2) 4415 return true; 4416 4417 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI); 4418 return N0Bank->getID() == AMDGPU::VGPRRegBankID; 4419 } 4420 4421 /// Split an immediate offset \p ImmOffset depending on whether it fits in the 4422 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable 4423 /// component. 4424 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset( 4425 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const { 4426 if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset)) 4427 return; 4428 4429 // Illegal offset, store it in soffset. 
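  // Illustrative example (not in the original source): an offset such as
  // 0x12345, which does not fit the MUBUF immediate field, becomes
  //   s_mov_b32 sOFF, 0x12345
  // and the instruction's immediate offset is reset to 0 below.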
4430 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 4431 B.buildInstr(AMDGPU::S_MOV_B32) 4432 .addDef(SOffset) 4433 .addImm(ImmOffset); 4434 ImmOffset = 0; 4435 } 4436 4437 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl( 4438 MachineOperand &Root, Register &VAddr, Register &RSrcReg, 4439 Register &SOffset, int64_t &Offset) const { 4440 // FIXME: Predicates should stop this from reaching here. 4441 // addr64 bit was removed for volcanic islands. 4442 if (!STI.hasAddr64() || STI.useFlatForGlobal()) 4443 return false; 4444 4445 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 4446 if (!shouldUseAddr64(AddrData)) 4447 return false; 4448 4449 Register N0 = AddrData.N0; 4450 Register N2 = AddrData.N2; 4451 Register N3 = AddrData.N3; 4452 Offset = AddrData.Offset; 4453 4454 // Base pointer for the SRD. 4455 Register SRDPtr; 4456 4457 if (N2) { 4458 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 4459 assert(N3); 4460 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 4461 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the 4462 // addr64, and construct the default resource from a 0 address. 4463 VAddr = N0; 4464 } else { 4465 SRDPtr = N3; 4466 VAddr = N2; 4467 } 4468 } else { 4469 // N2 is not divergent. 4470 SRDPtr = N2; 4471 VAddr = N3; 4472 } 4473 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 4474 // Use the default null pointer in the resource 4475 VAddr = N0; 4476 } else { 4477 // N0 -> offset, or 4478 // (N0 + C1) -> offset 4479 SRDPtr = N0; 4480 } 4481 4482 MachineIRBuilder B(*Root.getParent()); 4483 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr); 4484 splitIllegalMUBUFOffset(B, SOffset, Offset); 4485 return true; 4486 } 4487 4488 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( 4489 MachineOperand &Root, Register &RSrcReg, Register &SOffset, 4490 int64_t &Offset) const { 4491 4492 // FIXME: Pattern should not reach here. 4493 if (STI.useFlatForGlobal()) 4494 return false; 4495 4496 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 4497 if (shouldUseAddr64(AddrData)) 4498 return false; 4499 4500 // N0 -> offset, or 4501 // (N0 + C1) -> offset 4502 Register SRDPtr = AddrData.N0; 4503 Offset = AddrData.Offset; 4504 4505 // TODO: Look through extensions for 32-bit soffset. 4506 MachineIRBuilder B(*Root.getParent()); 4507 4508 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); 4509 splitIllegalMUBUFOffset(B, SOffset, Offset); 4510 return true; 4511 } 4512 4513 InstructionSelector::ComplexRendererFns 4514 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { 4515 Register VAddr; 4516 Register RSrcReg; 4517 Register SOffset; 4518 int64_t Offset = 0; 4519 4520 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 4521 return {}; 4522 4523 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 4524 // pattern. 
4525 return {{ 4526 [=](MachineInstrBuilder &MIB) { // rsrc 4527 MIB.addReg(RSrcReg); 4528 }, 4529 [=](MachineInstrBuilder &MIB) { // vaddr 4530 MIB.addReg(VAddr); 4531 }, 4532 [=](MachineInstrBuilder &MIB) { // soffset 4533 if (SOffset) 4534 MIB.addReg(SOffset); 4535 else 4536 MIB.addImm(0); 4537 }, 4538 [=](MachineInstrBuilder &MIB) { // offset 4539 MIB.addImm(Offset); 4540 }, 4541 addZeroImm, // cpol 4542 addZeroImm, // tfe 4543 addZeroImm // swz 4544 }}; 4545 } 4546 4547 InstructionSelector::ComplexRendererFns 4548 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { 4549 Register RSrcReg; 4550 Register SOffset; 4551 int64_t Offset = 0; 4552 4553 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 4554 return {}; 4555 4556 return {{ 4557 [=](MachineInstrBuilder &MIB) { // rsrc 4558 MIB.addReg(RSrcReg); 4559 }, 4560 [=](MachineInstrBuilder &MIB) { // soffset 4561 if (SOffset) 4562 MIB.addReg(SOffset); 4563 else 4564 MIB.addImm(0); 4565 }, 4566 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 4567 addZeroImm, // cpol 4568 addZeroImm, // tfe 4569 addZeroImm, // swz 4570 }}; 4571 } 4572 4573 InstructionSelector::ComplexRendererFns 4574 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const { 4575 Register VAddr; 4576 Register RSrcReg; 4577 Register SOffset; 4578 int64_t Offset = 0; 4579 4580 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 4581 return {}; 4582 4583 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 4584 // pattern. 4585 return {{ 4586 [=](MachineInstrBuilder &MIB) { // rsrc 4587 MIB.addReg(RSrcReg); 4588 }, 4589 [=](MachineInstrBuilder &MIB) { // vaddr 4590 MIB.addReg(VAddr); 4591 }, 4592 [=](MachineInstrBuilder &MIB) { // soffset 4593 if (SOffset) 4594 MIB.addReg(SOffset); 4595 else 4596 MIB.addImm(0); 4597 }, 4598 [=](MachineInstrBuilder &MIB) { // offset 4599 MIB.addImm(Offset); 4600 }, 4601 [=](MachineInstrBuilder &MIB) { 4602 MIB.addImm(AMDGPU::CPol::GLC); // cpol 4603 } 4604 }}; 4605 } 4606 4607 InstructionSelector::ComplexRendererFns 4608 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const { 4609 Register RSrcReg; 4610 Register SOffset; 4611 int64_t Offset = 0; 4612 4613 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 4614 return {}; 4615 4616 return {{ 4617 [=](MachineInstrBuilder &MIB) { // rsrc 4618 MIB.addReg(RSrcReg); 4619 }, 4620 [=](MachineInstrBuilder &MIB) { // soffset 4621 if (SOffset) 4622 MIB.addReg(SOffset); 4623 else 4624 MIB.addImm(0); 4625 }, 4626 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 4627 [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol 4628 }}; 4629 } 4630 4631 /// Get an immediate that must be 32-bits, and treated as zero extended. 4632 static Optional<uint64_t> getConstantZext32Val(Register Reg, 4633 const MachineRegisterInfo &MRI) { 4634 // getIConstantVRegVal sexts any values, so see if that matters. 
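  // e.g. an s32 G_CONSTANT of 0xfffff000 is reported as -4096 by the
  // sign-extending query; it still passes isInt<32>, and Lo_32() below
  // recovers the intended zero-extended pattern 0xfffff000.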
4635 Optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI); 4636 if (!OffsetVal || !isInt<32>(*OffsetVal)) 4637 return None; 4638 return Lo_32(*OffsetVal); 4639 } 4640 4641 InstructionSelector::ComplexRendererFns 4642 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const { 4643 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); 4644 if (!OffsetVal) 4645 return {}; 4646 4647 Optional<int64_t> EncodedImm = 4648 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true); 4649 if (!EncodedImm) 4650 return {}; 4651 4652 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 4653 } 4654 4655 InstructionSelector::ComplexRendererFns 4656 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const { 4657 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); 4658 4659 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); 4660 if (!OffsetVal) 4661 return {}; 4662 4663 Optional<int64_t> EncodedImm 4664 = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal); 4665 if (!EncodedImm) 4666 return {}; 4667 4668 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 4669 } 4670 4671 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, 4672 const MachineInstr &MI, 4673 int OpIdx) const { 4674 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 4675 "Expected G_CONSTANT"); 4676 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue()); 4677 } 4678 4679 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, 4680 const MachineInstr &MI, 4681 int OpIdx) const { 4682 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 4683 "Expected G_CONSTANT"); 4684 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue()); 4685 } 4686 4687 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB, 4688 const MachineInstr &MI, 4689 int OpIdx) const { 4690 assert(OpIdx == -1); 4691 4692 const MachineOperand &Op = MI.getOperand(1); 4693 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) 4694 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 4695 else { 4696 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); 4697 MIB.addImm(Op.getCImm()->getSExtValue()); 4698 } 4699 } 4700 4701 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, 4702 const MachineInstr &MI, 4703 int OpIdx) const { 4704 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 4705 "Expected G_CONSTANT"); 4706 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); 4707 } 4708 4709 /// This only really exists to satisfy DAG type checking machinery, so is a 4710 /// no-op here. 
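/// The immediate operand is simply forwarded unchanged.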
4711 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, 4712 const MachineInstr &MI, 4713 int OpIdx) const { 4714 MIB.addImm(MI.getOperand(OpIdx).getImm()); 4715 } 4716 4717 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, 4718 const MachineInstr &MI, 4719 int OpIdx) const { 4720 assert(OpIdx >= 0 && "expected to match an immediate operand"); 4721 MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL); 4722 } 4723 4724 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, 4725 const MachineInstr &MI, 4726 int OpIdx) const { 4727 assert(OpIdx >= 0 && "expected to match an immediate operand"); 4728 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); 4729 } 4730 4731 void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB, 4732 const MachineInstr &MI, 4733 int OpIdx) const { 4734 assert(OpIdx >= 0 && "expected to match an immediate operand"); 4735 MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC); 4736 } 4737 4738 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, 4739 const MachineInstr &MI, 4740 int OpIdx) const { 4741 MIB.addFrameIndex((MI.getOperand(1).getIndex())); 4742 } 4743 4744 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { 4745 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); 4746 } 4747 4748 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const { 4749 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm()); 4750 } 4751 4752 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const { 4753 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm()); 4754 } 4755 4756 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { 4757 return TII.isInlineConstant(Imm); 4758 } 4759