//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
                                        CodeGenCoverage &CoverageInfo) {
  MRI = &MF.getRegInfo();
  InstructionSelector::setupMF(MF, KB, CoverageInfo);
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
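  // Only virtual registers carry the register class / bank information
  // inspected below, so physical registers are conservatively reported as
  // not being VCC.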
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (Register::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
          .add(I.getOperand(1))
          .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .addDef(UnusedCarry, RegState::Dead)
        .add(I.getOperand(1))
        .add(I.getOperand(2))
        .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
      IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
    .addReg(AMDGPU::SCC);

  if (!MRI->getRegClassOrNull(Dst1Reg))
    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
  MachineInstr &MI) const {
  if (selectImpl(MI, *CoverageInfo))
    return true;

  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != V2S16)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI->getType(Src0) != S32)
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;

      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
        .addImm(Lo16 | (Hi16 << 16));
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
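    // Rewrite the G_BUILD_VECTOR_TRUNC in place into a plain COPY of $src0;
    // the undef high half contributes nothing.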
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.RemoveOperand(2);
    return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
           RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
  }

  Register ShiftSrc0;
  Register ShiftSrc1;
  int64_t ShiftAmt;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  // FIXME: This is an inconvenient way to check a specific value
  bool Shift0 = mi_match(
    Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  bool Shift1 = mi_match(
    Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
    // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
    auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
      .addReg(ShiftSrc0)
      .addImm(16);

    MI.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
  const TargetRegisterClass *Src1RC =
    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
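// The intrinsic produces a second, boolean result alongside the scaled value,
// so both defs have to be selected together.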
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64;
  else
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addUse(Src0)
    .addUse(Denom)
    .addUse(Numer);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::WWM);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
    return selectIntrinsicIcmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
      .add(I.getOperand(2))
      .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
      constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
      RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
                               I.getOperand(0).getReg())
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
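  // The destination holds a full wave-sized lane mask, so constrain it to the
  // target's boolean (wave mask) register class.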
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
                               *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;

  if (Size != STI.getWavefrontSize())
    return false;

  Optional<ValueAndVReg> Arg =
    getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);

  if (Arg.hasValue()) {
    const int64_t Value = Arg.getValue().Value;
    if (Value == 0) {
      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else if (Value == -1) { // all ones
      Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
    } else
      return false;
  } else {
    Register SrcReg = I.getOperand(2).getReg();
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
  }

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto RelocSymbol = cast<GlobalVariable>(
    M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  MachineBasicBlock *BB = I.getParent();
  BuildMI(*BB, &I, I.getDebugLoc(),
          TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
    .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
      !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions
  if (Depth != 0 ||
      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
      .addImm(0);
    I.eraseFromParent();
    return true;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  // Get the return address reg and mark it as an implicit live-in
  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                             AMDGPU::SReg_64RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(LiveIn);
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
  MachineBasicBlock *BB = MI.getParent();
  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
    .add(MI.getOperand(1));

  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
  return true;
}

bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
  MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease)
    report_fatal_error("ds_ordered_count: wave_done requires wave_release");

  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      report_fatal_error(
        "ds_ordered_count: dword count must be between 1 and 4");
    }
  }

  if (IndexOperand)
    report_fatal_error("ds_ordered_count: bad index operand");

  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);

  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                     (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !STI.hasGWSSemaReleaseAll())
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  assert(OffsetDef);

  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
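    // With m0 simply zeroed, the whole constant ends up in the instruction's
    // immediate offset field.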
    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset, OffsetDef)
      = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16);

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);
    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .addImm(-1) // $gds
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
    .addImm(Offset)
    .addImm(IsGDS ? -1 : 0)
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x2;

  return TexFailCtrl == 0;
}

static bool parseCachePolicy(uint64_t Value,
                             bool *GLC, bool *SLC, bool *DLC) {
  if (GLC) {
    *GLC = (Value & 0x1) ? 1 : 0;
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = (Value & 0x2) ? 1 : 0;
    Value &= ~(uint64_t)0x2;
  }
  if (DLC) {
    *DLC = (Value & 0x4) ? 1 : 0;
    Value &= ~(uint64_t)0x4;
  }

  return Value == 0;
}

bool AMDGPUInstructionSelector::selectImageIntrinsic(
  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
    AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
    AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;

  const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
                                             MI.getNumExplicitDefs());
  int NumVAddr, NumGradients;
  std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = false;

  // XXX - Can we just get the second to last argument for ctrl?
  unsigned CtrlIdx; // Index of texfailctrl argument
  bool Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = true;
    CtrlIdx = VAddrIdx + NumVAddr + 1;
  } else {
    Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
    CtrlIdx = VAddrIdx + NumVAddr + 3;
  }

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients
  if (IsA16 && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    const int DMaskIdx = 2; // Input/output + intrinsic ID.

    DMask = MI.getOperand(DMaskIdx).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      // One memoperand is mandatory, except for getresinfo.
      // FIXME: Check this in verifier.
      if (!MI.memoperands_empty()) {
        const MachineMemOperand *MMO = *MI.memoperands_begin();

        // Infer d16 from the memory size, as the register type will be mangled by
        // unpacked subtargets, or by TFE.
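        // For example, a 4-lane dmask backed by an 8-byte access works out to
        // 16 bits per lane, which indicates a d16 result.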
1463 IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32; 1464 1465 if (IsD16 && !STI.hasUnpackedD16VMem()) 1466 NumVDataDwords = (DMaskLanes + 1) / 2; 1467 } 1468 } 1469 } 1470 1471 // Optimize _L to _LZ when _L is zero 1472 if (LZMappingInfo) { 1473 // The legalizer replaced the register with an immediate 0 if we need to 1474 // change the opcode. 1475 const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1); 1476 if (Lod.isImm()) { 1477 assert(Lod.getImm() == 0); 1478 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l 1479 } 1480 } 1481 1482 // Optimize _mip away, when 'lod' is zero 1483 if (MIPMappingInfo) { 1484 const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1); 1485 if (Lod.isImm()) { 1486 assert(Lod.getImm() == 0); 1487 IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip 1488 } 1489 } 1490 1491 // Set G16 opcode 1492 if (IsG16 && !IsA16) { 1493 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = 1494 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); 1495 assert(G16MappingInfo); 1496 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16 1497 } 1498 1499 // TODO: Check this in verifier. 1500 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); 1501 1502 bool GLC = false; 1503 bool SLC = false; 1504 bool DLC = false; 1505 if (BaseOpcode->Atomic) { 1506 GLC = true; // TODO no-return optimization 1507 if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC, 1508 IsGFX10 ? &DLC : nullptr)) 1509 return false; 1510 } else { 1511 if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC, 1512 IsGFX10 ? &DLC : nullptr)) 1513 return false; 1514 } 1515 1516 int NumVAddrRegs = 0; 1517 int NumVAddrDwords = 0; 1518 for (int I = 0; I < NumVAddr; ++I) { 1519 // Skip the $noregs and 0s inserted during legalization. 1520 MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I); 1521 if (!AddrOp.isReg()) 1522 continue; // XXX - Break? 1523 1524 Register Addr = AddrOp.getReg(); 1525 if (!Addr) 1526 break; 1527 1528 ++NumVAddrRegs; 1529 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32; 1530 } 1531 1532 // The legalizer preprocessed the intrinsic arguments. If we aren't using 1533 // NSA, these should have beeen packed into a single value in the first 1534 // address register 1535 const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs; 1536 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { 1537 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n"); 1538 return false; 1539 } 1540 1541 if (IsTexFail) 1542 ++NumVDataDwords; 1543 1544 int Opcode = -1; 1545 if (IsGFX10) { 1546 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 1547 UseNSA ? AMDGPU::MIMGEncGfx10NSA 1548 : AMDGPU::MIMGEncGfx10Default, 1549 NumVDataDwords, NumVAddrDwords); 1550 } else { 1551 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 1552 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, 1553 NumVDataDwords, NumVAddrDwords); 1554 if (Opcode == -1) 1555 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, 1556 NumVDataDwords, NumVAddrDwords); 1557 } 1558 assert(Opcode != -1); 1559 1560 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode)) 1561 .cloneMemRefs(MI); 1562 1563 if (VDataOut) { 1564 if (BaseOpcode->AtomicX2) { 1565 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64; 1566 1567 Register TmpReg = MRI->createVirtualRegister( 1568 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); 1569 unsigned SubReg = Is64 ? 
AMDGPU::sub0_sub1 : AMDGPU::sub0; 1570 1571 MIB.addDef(TmpReg); 1572 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut) 1573 .addReg(TmpReg, RegState::Kill, SubReg); 1574 1575 } else { 1576 MIB.addDef(VDataOut); // vdata output 1577 } 1578 } 1579 1580 if (VDataIn) 1581 MIB.addReg(VDataIn); // vdata input 1582 1583 for (int i = 0; i != NumVAddrRegs; ++i) { 1584 MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i); 1585 if (SrcOp.isReg()) { 1586 assert(SrcOp.getReg() != 0); 1587 MIB.addReg(SrcOp.getReg()); 1588 } 1589 } 1590 1591 MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc 1592 if (BaseOpcode->Sampler) 1593 MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler 1594 1595 MIB.addImm(DMask); // dmask 1596 1597 if (IsGFX10) 1598 MIB.addImm(DimInfo->Encoding); 1599 MIB.addImm(Unorm); 1600 if (IsGFX10) 1601 MIB.addImm(DLC); 1602 1603 MIB.addImm(GLC); 1604 MIB.addImm(SLC); 1605 MIB.addImm(IsA16 && // a16 or r128 1606 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0); 1607 if (IsGFX10) 1608 MIB.addImm(IsA16 ? -1 : 0); 1609 1610 MIB.addImm(TFE); // tfe 1611 MIB.addImm(LWE); // lwe 1612 if (!IsGFX10) 1613 MIB.addImm(DimInfo->DA ? -1 : 0); 1614 if (BaseOpcode->HasD16) 1615 MIB.addImm(IsD16 ? -1 : 0); 1616 1617 MI.eraseFromParent(); 1618 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1619 } 1620 1621 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( 1622 MachineInstr &I) const { 1623 unsigned IntrinsicID = I.getIntrinsicID(); 1624 switch (IntrinsicID) { 1625 case Intrinsic::amdgcn_end_cf: 1626 return selectEndCfIntrinsic(I); 1627 case Intrinsic::amdgcn_ds_ordered_add: 1628 case Intrinsic::amdgcn_ds_ordered_swap: 1629 return selectDSOrderedIntrinsic(I, IntrinsicID); 1630 case Intrinsic::amdgcn_ds_gws_init: 1631 case Intrinsic::amdgcn_ds_gws_barrier: 1632 case Intrinsic::amdgcn_ds_gws_sema_v: 1633 case Intrinsic::amdgcn_ds_gws_sema_br: 1634 case Intrinsic::amdgcn_ds_gws_sema_p: 1635 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1636 return selectDSGWSIntrinsic(I, IntrinsicID); 1637 case Intrinsic::amdgcn_ds_append: 1638 return selectDSAppendConsume(I, true); 1639 case Intrinsic::amdgcn_ds_consume: 1640 return selectDSAppendConsume(I, false); 1641 default: { 1642 return selectImpl(I, *CoverageInfo); 1643 } 1644 } 1645 } 1646 1647 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { 1648 if (selectImpl(I, *CoverageInfo)) 1649 return true; 1650 1651 MachineBasicBlock *BB = I.getParent(); 1652 const DebugLoc &DL = I.getDebugLoc(); 1653 1654 Register DstReg = I.getOperand(0).getReg(); 1655 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 1656 assert(Size <= 32 || Size == 64); 1657 const MachineOperand &CCOp = I.getOperand(1); 1658 Register CCReg = CCOp.getReg(); 1659 if (!isVCC(CCReg, *MRI)) { 1660 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : 1661 AMDGPU::S_CSELECT_B32; 1662 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1663 .addReg(CCReg); 1664 1665 // The generic constrainSelectedInstRegOperands doesn't work for the scc register 1666 // bank, because it does not cover the register class that we used to represent 1667 // for it. So we need to manually set the register class here. 
1668 if (!MRI->getRegClassOrNull(CCReg)) 1669 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1670 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1671 .add(I.getOperand(2)) 1672 .add(I.getOperand(3)); 1673 1674 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1675 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1676 I.eraseFromParent(); 1677 return Ret; 1678 } 1679 1680 // Wide VGPR select should have been split in RegBankSelect. 1681 if (Size > 32) 1682 return false; 1683 1684 MachineInstr *Select = 1685 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1686 .addImm(0) 1687 .add(I.getOperand(3)) 1688 .addImm(0) 1689 .add(I.getOperand(2)) 1690 .add(I.getOperand(1)); 1691 1692 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1693 I.eraseFromParent(); 1694 return Ret; 1695 } 1696 1697 static int sizeToSubRegIndex(unsigned Size) { 1698 switch (Size) { 1699 case 32: 1700 return AMDGPU::sub0; 1701 case 64: 1702 return AMDGPU::sub0_sub1; 1703 case 96: 1704 return AMDGPU::sub0_sub1_sub2; 1705 case 128: 1706 return AMDGPU::sub0_sub1_sub2_sub3; 1707 case 256: 1708 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1709 default: 1710 if (Size < 32) 1711 return AMDGPU::sub0; 1712 if (Size > 256) 1713 return -1; 1714 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1715 } 1716 } 1717 1718 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1719 Register DstReg = I.getOperand(0).getReg(); 1720 Register SrcReg = I.getOperand(1).getReg(); 1721 const LLT DstTy = MRI->getType(DstReg); 1722 const LLT SrcTy = MRI->getType(SrcReg); 1723 const LLT S1 = LLT::scalar(1); 1724 1725 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1726 const RegisterBank *DstRB; 1727 if (DstTy == S1) { 1728 // This is a special case. We don't treat s1 for legalization artifacts as 1729 // vcc booleans. 1730 DstRB = SrcRB; 1731 } else { 1732 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1733 if (SrcRB != DstRB) 1734 return false; 1735 } 1736 1737 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 1738 1739 unsigned DstSize = DstTy.getSizeInBits(); 1740 unsigned SrcSize = SrcTy.getSizeInBits(); 1741 1742 const TargetRegisterClass *SrcRC 1743 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1744 const TargetRegisterClass *DstRC 1745 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1746 if (!SrcRC || !DstRC) 1747 return false; 1748 1749 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1750 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1751 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1752 return false; 1753 } 1754 1755 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { 1756 MachineBasicBlock *MBB = I.getParent(); 1757 const DebugLoc &DL = I.getDebugLoc(); 1758 1759 Register LoReg = MRI->createVirtualRegister(DstRC); 1760 Register HiReg = MRI->createVirtualRegister(DstRC); 1761 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 1762 .addReg(SrcReg, 0, AMDGPU::sub0); 1763 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 1764 .addReg(SrcReg, 0, AMDGPU::sub1); 1765 1766 if (IsVALU && STI.hasSDWA()) { 1767 // Write the low 16-bits of the high element into the high 16-bits of the 1768 // low element. 
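// A single V_MOV_B32_sdwa does this: dst_sel WORD_1 writes only the high
// 16 bits of the destination, dst_unused UNUSED_PRESERVE keeps the low 16
// bits (tied to LoReg below), and src0_sel WORD_0 reads the low 16 bits of
// HiReg.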
1769 MachineInstr *MovSDWA = 1770 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1771 .addImm(0) // $src0_modifiers 1772 .addReg(HiReg) // $src0 1773 .addImm(0) // $clamp 1774 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1775 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1776 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1777 .addReg(LoReg, RegState::Implicit); 1778 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1779 } else { 1780 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1781 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1782 Register ImmReg = MRI->createVirtualRegister(DstRC); 1783 if (IsVALU) { 1784 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1785 .addImm(16) 1786 .addReg(HiReg); 1787 } else { 1788 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1789 .addReg(HiReg) 1790 .addImm(16); 1791 } 1792 1793 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1794 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1795 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1796 1797 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1798 .addImm(0xffff); 1799 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1800 .addReg(LoReg) 1801 .addReg(ImmReg); 1802 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1803 .addReg(TmpReg0) 1804 .addReg(TmpReg1); 1805 } 1806 1807 I.eraseFromParent(); 1808 return true; 1809 } 1810 1811 if (!DstTy.isScalar()) 1812 return false; 1813 1814 if (SrcSize > 32) { 1815 int SubRegIdx = sizeToSubRegIndex(DstSize); 1816 if (SubRegIdx == -1) 1817 return false; 1818 1819 // Deal with weird cases where the class only partially supports the subreg 1820 // index. 1821 const TargetRegisterClass *SrcWithSubRC 1822 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1823 if (!SrcWithSubRC) 1824 return false; 1825 1826 if (SrcWithSubRC != SrcRC) { 1827 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1828 return false; 1829 } 1830 1831 I.getOperand(1).setSubReg(SubRegIdx); 1832 } 1833 1834 I.setDesc(TII.get(TargetOpcode::COPY)); 1835 return true; 1836 } 1837 1838 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1839 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1840 Mask = maskTrailingOnes<unsigned>(Size); 1841 int SignedMask = static_cast<int>(Mask); 1842 return SignedMask >= -16 && SignedMask <= 64; 1843 } 1844 1845 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1846 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1847 Register Reg, const MachineRegisterInfo &MRI, 1848 const TargetRegisterInfo &TRI) const { 1849 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1850 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1851 return RB; 1852 1853 // Ignore the type, since we don't use vcc in artifacts. 
1854 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1855 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1856 return nullptr; 1857 } 1858 1859 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1860 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1861 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1862 const DebugLoc &DL = I.getDebugLoc(); 1863 MachineBasicBlock &MBB = *I.getParent(); 1864 const Register DstReg = I.getOperand(0).getReg(); 1865 const Register SrcReg = I.getOperand(1).getReg(); 1866 1867 const LLT DstTy = MRI->getType(DstReg); 1868 const LLT SrcTy = MRI->getType(SrcReg); 1869 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1870 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1871 const unsigned DstSize = DstTy.getSizeInBits(); 1872 if (!DstTy.isScalar()) 1873 return false; 1874 1875 // Artifact casts should never use vcc. 1876 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1877 1878 // FIXME: This should probably be illegal and split earlier. 1879 if (I.getOpcode() == AMDGPU::G_ANYEXT) { 1880 if (DstSize <= 32) 1881 return selectCOPY(I); 1882 1883 const TargetRegisterClass *SrcRC = 1884 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI); 1885 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 1886 const TargetRegisterClass *DstRC = 1887 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 1888 1889 Register UndefReg = MRI->createVirtualRegister(SrcRC); 1890 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1891 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1892 .addReg(SrcReg) 1893 .addImm(AMDGPU::sub0) 1894 .addReg(UndefReg) 1895 .addImm(AMDGPU::sub1); 1896 I.eraseFromParent(); 1897 1898 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) && 1899 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI); 1900 } 1901 1902 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1903 // 64-bit should have been split up in RegBankSelect 1904 1905 // Try to use an and with a mask if it will save code size. 1906 unsigned Mask; 1907 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1908 MachineInstr *ExtI = 1909 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1910 .addImm(Mask) 1911 .addReg(SrcReg); 1912 I.eraseFromParent(); 1913 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1914 } 1915 1916 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1917 MachineInstr *ExtI = 1918 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1919 .addReg(SrcReg) 1920 .addImm(0) // Offset 1921 .addImm(SrcSize); // Width 1922 I.eraseFromParent(); 1923 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1924 } 1925 1926 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1927 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 1928 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 1929 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 1930 return false; 1931 1932 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1933 const unsigned SextOpc = SrcSize == 8 ? 1934 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1935 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1936 .addReg(SrcReg); 1937 I.eraseFromParent(); 1938 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1939 } 1940 1941 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1942 const unsigned BFE32 = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1943 1944 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1945 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 1946 // We need a 64-bit register source, but the high bits don't matter. 1947 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1948 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1949 unsigned SubReg = InReg ? AMDGPU::sub0 : 0; 1950 1951 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1952 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1953 .addReg(SrcReg, 0, SubReg) 1954 .addImm(AMDGPU::sub0) 1955 .addReg(UndefReg) 1956 .addImm(AMDGPU::sub1); 1957 1958 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1959 .addReg(ExtReg) 1960 .addImm(SrcSize << 16); 1961 1962 I.eraseFromParent(); 1963 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1964 } 1965 1966 unsigned Mask; 1967 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1968 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1969 .addReg(SrcReg) 1970 .addImm(Mask); 1971 } else { 1972 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1973 .addReg(SrcReg) 1974 .addImm(SrcSize << 16); 1975 } 1976 1977 I.eraseFromParent(); 1978 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1979 } 1980 1981 return false; 1982 } 1983 1984 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1985 MachineBasicBlock *BB = I.getParent(); 1986 MachineOperand &ImmOp = I.getOperand(1); 1987 1988 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1989 if (ImmOp.isFPImm()) { 1990 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1991 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1992 } else if (ImmOp.isCImm()) { 1993 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); 1994 } 1995 1996 Register DstReg = I.getOperand(0).getReg(); 1997 unsigned Size; 1998 bool IsSgpr; 1999 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 2000 if (RB) { 2001 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 2002 Size = MRI->getType(DstReg).getSizeInBits(); 2003 } else { 2004 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 2005 IsSgpr = TRI.isSGPRClass(RC); 2006 Size = TRI.getRegSizeInBits(*RC); 2007 } 2008 2009 if (Size != 32 && Size != 64) 2010 return false; 2011 2012 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2013 if (Size == 32) { 2014 I.setDesc(TII.get(Opcode)); 2015 I.addImplicitDefUseOperands(*MF); 2016 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2017 } 2018 2019 const DebugLoc &DL = I.getDebugLoc(); 2020 2021 APInt Imm(Size, I.getOperand(1).getImm()); 2022 2023 MachineInstr *ResInst; 2024 if (IsSgpr && TII.isInlineConstant(Imm)) { 2025 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 2026 .addImm(I.getOperand(1).getImm()); 2027 } else { 2028 const TargetRegisterClass *RC = IsSgpr ? 
2029 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 2030 Register LoReg = MRI->createVirtualRegister(RC); 2031 Register HiReg = MRI->createVirtualRegister(RC); 2032 2033 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 2034 .addImm(Imm.trunc(32).getZExtValue()); 2035 2036 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 2037 .addImm(Imm.ashr(32).getZExtValue()); 2038 2039 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2040 .addReg(LoReg) 2041 .addImm(AMDGPU::sub0) 2042 .addReg(HiReg) 2043 .addImm(AMDGPU::sub1); 2044 } 2045 2046 // We can't call constrainSelectedInstRegOperands here, because it doesn't 2047 // work for target independent opcodes 2048 I.eraseFromParent(); 2049 const TargetRegisterClass *DstRC = 2050 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 2051 if (!DstRC) 2052 return true; 2053 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 2054 } 2055 2056 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2057 // Only manually handle the f64 SGPR case. 2058 // 2059 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2060 // the bit ops theoretically have a second result due to the implicit def of 2061 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2062 // that is easy by disabling the check. The result works, but uses a 2063 // nonsensical sreg32orlds_and_sreg_1 regclass. 2064 // 2065 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2066 // the variadic REG_SEQUENCE operands. 2067 2068 Register Dst = MI.getOperand(0).getReg(); 2069 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2070 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2071 MRI->getType(Dst) != LLT::scalar(64)) 2072 return false; 2073 2074 Register Src = MI.getOperand(1).getReg(); 2075 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 2076 if (Fabs) 2077 Src = Fabs->getOperand(1).getReg(); 2078 2079 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2080 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2081 return false; 2082 2083 MachineBasicBlock *BB = MI.getParent(); 2084 const DebugLoc &DL = MI.getDebugLoc(); 2085 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2086 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2087 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2088 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2089 2090 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2091 .addReg(Src, 0, AMDGPU::sub0); 2092 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2093 .addReg(Src, 0, AMDGPU::sub1); 2094 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2095 .addImm(0x80000000); 2096 2097 // Set or toggle sign bit. 2098 unsigned Opc = Fabs ? 
AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32; 2099 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg) 2100 .addReg(HiReg) 2101 .addReg(ConstReg); 2102 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2103 .addReg(LoReg) 2104 .addImm(AMDGPU::sub0) 2105 .addReg(OpReg) 2106 .addImm(AMDGPU::sub1); 2107 MI.eraseFromParent(); 2108 return true; 2109 } 2110 2111 // FIXME: This is a workaround for the same tablegen problems as G_FNEG 2112 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { 2113 Register Dst = MI.getOperand(0).getReg(); 2114 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2115 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2116 MRI->getType(Dst) != LLT::scalar(64)) 2117 return false; 2118 2119 Register Src = MI.getOperand(1).getReg(); 2120 MachineBasicBlock *BB = MI.getParent(); 2121 const DebugLoc &DL = MI.getDebugLoc(); 2122 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2123 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2124 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2125 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2126 2127 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2128 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2129 return false; 2130 2131 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2132 .addReg(Src, 0, AMDGPU::sub0); 2133 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2134 .addReg(Src, 0, AMDGPU::sub1); 2135 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2136 .addImm(0x7fffffff); 2137 2138 // Clear sign bit. 2139 // TODO: Should this used S_BITSET0_*? 2140 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) 2141 .addReg(HiReg) 2142 .addReg(ConstReg); 2143 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2144 .addReg(LoReg) 2145 .addImm(AMDGPU::sub0) 2146 .addReg(OpReg) 2147 .addImm(AMDGPU::sub1); 2148 2149 MI.eraseFromParent(); 2150 return true; 2151 } 2152 2153 static bool isConstant(const MachineInstr &MI) { 2154 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 2155 } 2156 2157 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 2158 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 2159 2160 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); 2161 2162 assert(PtrMI); 2163 2164 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 2165 return; 2166 2167 GEPInfo GEPInfo(*PtrMI); 2168 2169 for (unsigned i = 1; i != 3; ++i) { 2170 const MachineOperand &GEPOp = PtrMI->getOperand(i); 2171 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 2172 assert(OpDef); 2173 if (i == 2 && isConstant(*OpDef)) { 2174 // TODO: Could handle constant base + variable offset, but a combine 2175 // probably should have commuted it. 
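// e.g. for %ptr = G_PTR_ADD %base, %c with %c = G_CONSTANT 16, record
// Imm = 16 here; the base operand is classified into SgprParts or VgprParts
// by its register bank.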
2176 assert(GEPInfo.Imm == 0); 2177 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 2178 continue; 2179 } 2180 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 2181 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 2182 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 2183 else 2184 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 2185 } 2186 2187 AddrInfo.push_back(GEPInfo); 2188 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 2189 } 2190 2191 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 2192 if (!MI.hasOneMemOperand()) 2193 return false; 2194 2195 const MachineMemOperand *MMO = *MI.memoperands_begin(); 2196 const Value *Ptr = MMO->getValue(); 2197 2198 // UndefValue means this is a load of a kernel input. These are uniform. 2199 // Sometimes LDS instructions have constant pointers. 2200 // If Ptr is null, then that means this mem operand contains a 2201 // PseudoSourceValue like GOT. 2202 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || 2203 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) 2204 return true; 2205 2206 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 2207 return true; 2208 2209 const Instruction *I = dyn_cast<Instruction>(Ptr); 2210 return I && I->getMetadata("amdgpu.uniform"); 2211 } 2212 2213 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { 2214 for (const GEPInfo &GEPInfo : AddrInfo) { 2215 if (!GEPInfo.VgprParts.empty()) 2216 return true; 2217 } 2218 return false; 2219 } 2220 2221 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { 2222 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); 2223 unsigned AS = PtrTy.getAddressSpace(); 2224 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && 2225 STI.ldsRequiresM0Init()) { 2226 MachineBasicBlock *BB = I.getParent(); 2227 2228 // If DS instructions require M0 initializtion, insert it before selecting. 2229 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 2230 .addImm(-1); 2231 } 2232 } 2233 2234 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( 2235 MachineInstr &I) const { 2236 initM0(I); 2237 return selectImpl(I, *CoverageInfo); 2238 } 2239 2240 // TODO: No rtn optimization. 2241 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG( 2242 MachineInstr &MI) const { 2243 Register PtrReg = MI.getOperand(1).getReg(); 2244 const LLT PtrTy = MRI->getType(PtrReg); 2245 if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || 2246 STI.useFlatForGlobal()) 2247 return selectImpl(MI, *CoverageInfo); 2248 2249 Register DstReg = MI.getOperand(0).getReg(); 2250 const LLT Ty = MRI->getType(DstReg); 2251 const bool Is64 = Ty.getSizeInBits() == 64; 2252 const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; 2253 Register TmpReg = MRI->createVirtualRegister( 2254 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); 2255 2256 const DebugLoc &DL = MI.getDebugLoc(); 2257 MachineBasicBlock *BB = MI.getParent(); 2258 2259 Register VAddr, RSrcReg, SOffset; 2260 int64_t Offset = 0; 2261 2262 unsigned Opcode; 2263 if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) { 2264 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN : 2265 AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN; 2266 } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr, 2267 RSrcReg, SOffset, Offset)) { 2268 Opcode = Is64 ? 
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : 2269 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; 2270 } else 2271 return selectImpl(MI, *CoverageInfo); 2272 2273 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) 2274 .addReg(MI.getOperand(2).getReg()); 2275 2276 if (VAddr) 2277 MIB.addReg(VAddr); 2278 2279 MIB.addReg(RSrcReg); 2280 if (SOffset) 2281 MIB.addReg(SOffset); 2282 else 2283 MIB.addImm(0); 2284 2285 MIB.addImm(Offset); 2286 MIB.addImm(0); // slc 2287 MIB.cloneMemRefs(MI); 2288 2289 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) 2290 .addReg(TmpReg, RegState::Kill, SubReg); 2291 2292 MI.eraseFromParent(); 2293 2294 MRI->setRegClass( 2295 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); 2296 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2297 } 2298 2299 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 2300 MachineBasicBlock *BB = I.getParent(); 2301 MachineOperand &CondOp = I.getOperand(0); 2302 Register CondReg = CondOp.getReg(); 2303 const DebugLoc &DL = I.getDebugLoc(); 2304 2305 unsigned BrOpcode; 2306 Register CondPhysReg; 2307 const TargetRegisterClass *ConstrainRC; 2308 2309 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2310 // whether the branch is uniform when selecting the instruction. In 2311 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 2312 // RegBankSelect knows what it's doing if the branch condition is scc, even 2313 // though it currently does not. 2314 if (!isVCC(CondReg, *MRI)) { 2315 if (MRI->getType(CondReg) != LLT::scalar(32)) 2316 return false; 2317 2318 CondPhysReg = AMDGPU::SCC; 2319 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 2320 ConstrainRC = &AMDGPU::SReg_32RegClass; 2321 } else { 2322 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 2323 // We sort of know that a VCC producer based on the register bank, that ands 2324 // inactive lanes with 0. What if there was a logical operation with vcc 2325 // producers in different blocks/with different exec masks? 2326 // FIXME: Should scc->vcc copies and with exec? 2327 CondPhysReg = TRI.getVCC(); 2328 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 2329 ConstrainRC = TRI.getBoolRC(); 2330 } 2331 2332 if (!MRI->getRegClassOrNull(CondReg)) 2333 MRI->setRegClass(CondReg, ConstrainRC); 2334 2335 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 2336 .addReg(CondReg); 2337 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 2338 .addMBB(I.getOperand(1).getMBB()); 2339 2340 I.eraseFromParent(); 2341 return true; 2342 } 2343 2344 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE( 2345 MachineInstr &I) const { 2346 Register DstReg = I.getOperand(0).getReg(); 2347 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2348 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2349 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 2350 if (IsVGPR) 2351 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 2352 2353 return RBI.constrainGenericRegister( 2354 DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 2355 } 2356 2357 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 2358 Register DstReg = I.getOperand(0).getReg(); 2359 Register SrcReg = I.getOperand(1).getReg(); 2360 Register MaskReg = I.getOperand(2).getReg(); 2361 LLT Ty = MRI->getType(DstReg); 2362 LLT MaskTy = MRI->getType(MaskReg); 2363 2364 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2365 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2366 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 2367 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2368 if (DstRB != SrcRB) // Should only happen for hand written MIR. 2369 return false; 2370 2371 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2372 const TargetRegisterClass &RegRC 2373 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2374 2375 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 2376 *MRI); 2377 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 2378 *MRI); 2379 const TargetRegisterClass *MaskRC = 2380 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); 2381 2382 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2383 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2384 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 2385 return false; 2386 2387 MachineBasicBlock *BB = I.getParent(); 2388 const DebugLoc &DL = I.getDebugLoc(); 2389 if (Ty.getSizeInBits() == 32) { 2390 assert(MaskTy.getSizeInBits() == 32 && 2391 "ptrmask should have been narrowed during legalize"); 2392 2393 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 2394 .addReg(SrcReg) 2395 .addReg(MaskReg); 2396 I.eraseFromParent(); 2397 return true; 2398 } 2399 2400 Register HiReg = MRI->createVirtualRegister(&RegRC); 2401 Register LoReg = MRI->createVirtualRegister(&RegRC); 2402 2403 // Extract the subregisters from the source pointer. 2404 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 2405 .addReg(SrcReg, 0, AMDGPU::sub0); 2406 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 2407 .addReg(SrcReg, 0, AMDGPU::sub1); 2408 2409 Register MaskedLo, MaskedHi; 2410 2411 // Try to avoid emitting a bit operation when we only need to touch half of 2412 // the 64-bit pointer. 2413 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); 2414 2415 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 2416 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 2417 if ((MaskOnes & MaskLo32) == MaskLo32) { 2418 // If all the bits in the low half are 1, we only need a copy for it. 2419 MaskedLo = LoReg; 2420 } else { 2421 // Extract the mask subregister and apply the and. 2422 Register MaskLo = MRI->createVirtualRegister(&RegRC); 2423 MaskedLo = MRI->createVirtualRegister(&RegRC); 2424 2425 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 2426 .addReg(MaskReg, 0, AMDGPU::sub0); 2427 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 2428 .addReg(LoReg) 2429 .addReg(MaskLo); 2430 } 2431 2432 if ((MaskOnes & MaskHi32) == MaskHi32) { 2433 // If all the bits in the high half are 1, we only need a copy for it. 
2434 MaskedHi = HiReg; 2435 } else { 2436 Register MaskHi = MRI->createVirtualRegister(&RegRC); 2437 MaskedHi = MRI->createVirtualRegister(&RegRC); 2438 2439 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 2440 .addReg(MaskReg, 0, AMDGPU::sub1); 2441 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 2442 .addReg(HiReg) 2443 .addReg(MaskHi); 2444 } 2445 2446 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2447 .addReg(MaskedLo) 2448 .addImm(AMDGPU::sub0) 2449 .addReg(MaskedHi) 2450 .addImm(AMDGPU::sub1); 2451 I.eraseFromParent(); 2452 return true; 2453 } 2454 2455 /// Return the register to use for the index value, and the subregister to use 2456 /// for the indirectly accessed register. 2457 static std::pair<Register, unsigned> 2458 computeIndirectRegIndex(MachineRegisterInfo &MRI, 2459 const SIRegisterInfo &TRI, 2460 const TargetRegisterClass *SuperRC, 2461 Register IdxReg, 2462 unsigned EltSize) { 2463 Register IdxBaseReg; 2464 int Offset; 2465 MachineInstr *Unused; 2466 2467 std::tie(IdxBaseReg, Offset, Unused) 2468 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 2469 if (IdxBaseReg == AMDGPU::NoRegister) { 2470 // This will happen if the index is a known constant. This should ordinarily 2471 // be legalized out, but handle it as a register just in case. 2472 assert(Offset == 0); 2473 IdxBaseReg = IdxReg; 2474 } 2475 2476 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 2477 2478 // Skip out of bounds offsets, or else we would end up using an undefined 2479 // register. 2480 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 2481 return std::make_pair(IdxReg, SubRegs[0]); 2482 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 2483 } 2484 2485 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 2486 MachineInstr &MI) const { 2487 Register DstReg = MI.getOperand(0).getReg(); 2488 Register SrcReg = MI.getOperand(1).getReg(); 2489 Register IdxReg = MI.getOperand(2).getReg(); 2490 2491 LLT DstTy = MRI->getType(DstReg); 2492 LLT SrcTy = MRI->getType(SrcReg); 2493 2494 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2495 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2496 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2497 2498 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2499 // into a waterfall loop. 2500 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2501 return false; 2502 2503 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 2504 *MRI); 2505 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 2506 *MRI); 2507 if (!SrcRC || !DstRC) 2508 return false; 2509 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2510 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2511 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2512 return false; 2513 2514 MachineBasicBlock *BB = MI.getParent(); 2515 const DebugLoc &DL = MI.getDebugLoc(); 2516 const bool Is64 = DstTy.getSizeInBits() == 64; 2517 2518 unsigned SubReg; 2519 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 2520 DstTy.getSizeInBits() / 8); 2521 2522 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 2523 if (DstTy.getSizeInBits() != 32 && !Is64) 2524 return false; 2525 2526 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2527 .addReg(IdxReg); 2528 2529 unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 2530 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 2531 .addReg(SrcReg, 0, SubReg) 2532 .addReg(SrcReg, RegState::Implicit); 2533 MI.eraseFromParent(); 2534 return true; 2535 } 2536 2537 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 2538 return false; 2539 2540 if (!STI.useVGPRIndexMode()) { 2541 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2542 .addReg(IdxReg); 2543 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 2544 .addReg(SrcReg, 0, SubReg) 2545 .addReg(SrcReg, RegState::Implicit); 2546 MI.eraseFromParent(); 2547 return true; 2548 } 2549 2550 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2551 .addReg(IdxReg) 2552 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 2553 BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg) 2554 .addReg(SrcReg, 0, SubReg) 2555 .addReg(SrcReg, RegState::Implicit) 2556 .addReg(AMDGPU::M0, RegState::Implicit); 2557 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2558 2559 MI.eraseFromParent(); 2560 return true; 2561 } 2562 2563 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 2564 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 2565 MachineInstr &MI) const { 2566 Register DstReg = MI.getOperand(0).getReg(); 2567 Register VecReg = MI.getOperand(1).getReg(); 2568 Register ValReg = MI.getOperand(2).getReg(); 2569 Register IdxReg = MI.getOperand(3).getReg(); 2570 2571 LLT VecTy = MRI->getType(DstReg); 2572 LLT ValTy = MRI->getType(ValReg); 2573 unsigned VecSize = VecTy.getSizeInBits(); 2574 unsigned ValSize = ValTy.getSizeInBits(); 2575 2576 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 2577 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 2578 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2579 2580 assert(VecTy.getElementType() == ValTy); 2581 2582 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2583 // into a waterfall loop. 
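// The write below goes through an indirect register-write pseudo; the index
// is supplied either via VGPR index mode (S_SET_GPR_IDX_ON/OFF) or by
// copying it into M0.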
2584 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2585 return false; 2586 2587 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 2588 *MRI); 2589 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 2590 *MRI); 2591 2592 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 2593 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 2594 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 2595 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2596 return false; 2597 2598 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 2599 return false; 2600 2601 unsigned SubReg; 2602 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 2603 ValSize / 8); 2604 2605 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 2606 STI.useVGPRIndexMode(); 2607 2608 MachineBasicBlock *BB = MI.getParent(); 2609 const DebugLoc &DL = MI.getDebugLoc(); 2610 2611 if (IndexMode) { 2612 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2613 .addReg(IdxReg) 2614 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 2615 } else { 2616 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2617 .addReg(IdxReg); 2618 } 2619 2620 const MCInstrDesc &RegWriteOp 2621 = TII.getIndirectRegWritePseudo(VecSize, ValSize, 2622 VecRB->getID() == AMDGPU::SGPRRegBankID); 2623 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 2624 .addReg(VecReg) 2625 .addReg(ValReg) 2626 .addImm(SubReg); 2627 2628 if (IndexMode) 2629 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2630 2631 MI.eraseFromParent(); 2632 return true; 2633 } 2634 2635 static bool isZeroOrUndef(int X) { 2636 return X == 0 || X == -1; 2637 } 2638 2639 static bool isOneOrUndef(int X) { 2640 return X == 1 || X == -1; 2641 } 2642 2643 static bool isZeroOrOneOrUndef(int X) { 2644 return X == 0 || X == 1 || X == -1; 2645 } 2646 2647 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single 2648 // 32-bit register. 2649 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, 2650 ArrayRef<int> Mask) { 2651 NewMask[0] = Mask[0]; 2652 NewMask[1] = Mask[1]; 2653 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) 2654 return Src0; 2655 2656 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); 2657 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); 2658 2659 // Shift the mask inputs to be 0/1; 2660 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; 2661 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; 2662 return Src1; 2663 } 2664 2665 // This is only legal with VOP3P instructions as an aid to op_sel matching. 
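// Only v2s16 shuffles whose mask is a legal VOP3P mask are handled; such a
// mask reads at most one source, so after normalizeVOP3PMask each remaining
// case maps to a copy, a 16-bit shift, an SDWA mov, an S_PACK_*_B32_B16, or
// V_ALIGNBIT_B32.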
2666 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( 2667 MachineInstr &MI) const { 2668 Register DstReg = MI.getOperand(0).getReg(); 2669 Register Src0Reg = MI.getOperand(1).getReg(); 2670 Register Src1Reg = MI.getOperand(2).getReg(); 2671 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); 2672 2673 const LLT V2S16 = LLT::vector(2, 16); 2674 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) 2675 return false; 2676 2677 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) 2678 return false; 2679 2680 assert(ShufMask.size() == 2); 2681 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); 2682 2683 MachineBasicBlock *MBB = MI.getParent(); 2684 const DebugLoc &DL = MI.getDebugLoc(); 2685 2686 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2687 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2688 const TargetRegisterClass &RC = IsVALU ? 2689 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2690 2691 // Handle the degenerate case which should have folded out. 2692 if (ShufMask[0] == -1 && ShufMask[1] == -1) { 2693 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); 2694 2695 MI.eraseFromParent(); 2696 return RBI.constrainGenericRegister(DstReg, RC, *MRI); 2697 } 2698 2699 // A legal VOP3P mask only reads one of the sources. 2700 int Mask[2]; 2701 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); 2702 2703 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || 2704 !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) 2705 return false; 2706 2707 // TODO: This also should have been folded out 2708 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { 2709 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) 2710 .addReg(SrcVec); 2711 2712 MI.eraseFromParent(); 2713 return true; 2714 } 2715 2716 if (Mask[0] == 1 && Mask[1] == -1) { 2717 if (IsVALU) { 2718 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 2719 .addImm(16) 2720 .addReg(SrcVec); 2721 } else { 2722 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 2723 .addReg(SrcVec) 2724 .addImm(16); 2725 } 2726 } else if (Mask[0] == -1 && Mask[1] == 0) { 2727 if (IsVALU) { 2728 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) 2729 .addImm(16) 2730 .addReg(SrcVec); 2731 } else { 2732 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) 2733 .addReg(SrcVec) 2734 .addImm(16); 2735 } 2736 } else if (Mask[0] == 0 && Mask[1] == 0) { 2737 if (IsVALU) { 2738 // Write low half of the register into the high half. 2739 MachineInstr *MovSDWA = 2740 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2741 .addImm(0) // $src0_modifiers 2742 .addReg(SrcVec) // $src0 2743 .addImm(0) // $clamp 2744 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2745 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2746 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2747 .addReg(SrcVec, RegState::Implicit); 2748 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2749 } else { 2750 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2751 .addReg(SrcVec) 2752 .addReg(SrcVec); 2753 } 2754 } else if (Mask[0] == 1 && Mask[1] == 1) { 2755 if (IsVALU) { 2756 // Write high half of the register into the low half. 
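// Same SDWA trick as the (0, 0) case above, with the word selects swapped.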
2757 MachineInstr *MovSDWA = 2758 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2759 .addImm(0) // $src0_modifiers 2760 .addReg(SrcVec) // $src0 2761 .addImm(0) // $clamp 2762 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel 2763 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2764 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel 2765 .addReg(SrcVec, RegState::Implicit); 2766 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2767 } else { 2768 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) 2769 .addReg(SrcVec) 2770 .addReg(SrcVec); 2771 } 2772 } else if (Mask[0] == 1 && Mask[1] == 0) { 2773 if (IsVALU) { 2774 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg) 2775 .addReg(SrcVec) 2776 .addReg(SrcVec) 2777 .addImm(16); 2778 } else { 2779 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2780 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) 2781 .addReg(SrcVec) 2782 .addImm(16); 2783 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2784 .addReg(TmpReg) 2785 .addReg(SrcVec); 2786 } 2787 } else 2788 llvm_unreachable("all shuffle masks should be handled"); 2789 2790 MI.eraseFromParent(); 2791 return true; 2792 } 2793 2794 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 2795 if (I.isPHI()) 2796 return selectPHI(I); 2797 2798 if (!I.isPreISelOpcode()) { 2799 if (I.isCopy()) 2800 return selectCOPY(I); 2801 return true; 2802 } 2803 2804 switch (I.getOpcode()) { 2805 case TargetOpcode::G_AND: 2806 case TargetOpcode::G_OR: 2807 case TargetOpcode::G_XOR: 2808 if (selectImpl(I, *CoverageInfo)) 2809 return true; 2810 return selectG_AND_OR_XOR(I); 2811 case TargetOpcode::G_ADD: 2812 case TargetOpcode::G_SUB: 2813 if (selectImpl(I, *CoverageInfo)) 2814 return true; 2815 return selectG_ADD_SUB(I); 2816 case TargetOpcode::G_UADDO: 2817 case TargetOpcode::G_USUBO: 2818 case TargetOpcode::G_UADDE: 2819 case TargetOpcode::G_USUBE: 2820 return selectG_UADDO_USUBO_UADDE_USUBE(I); 2821 case TargetOpcode::G_INTTOPTR: 2822 case TargetOpcode::G_BITCAST: 2823 case TargetOpcode::G_PTRTOINT: 2824 return selectCOPY(I); 2825 case TargetOpcode::G_CONSTANT: 2826 case TargetOpcode::G_FCONSTANT: 2827 return selectG_CONSTANT(I); 2828 case TargetOpcode::G_FNEG: 2829 if (selectImpl(I, *CoverageInfo)) 2830 return true; 2831 return selectG_FNEG(I); 2832 case TargetOpcode::G_FABS: 2833 if (selectImpl(I, *CoverageInfo)) 2834 return true; 2835 return selectG_FABS(I); 2836 case TargetOpcode::G_EXTRACT: 2837 return selectG_EXTRACT(I); 2838 case TargetOpcode::G_MERGE_VALUES: 2839 case TargetOpcode::G_BUILD_VECTOR: 2840 case TargetOpcode::G_CONCAT_VECTORS: 2841 return selectG_MERGE_VALUES(I); 2842 case TargetOpcode::G_UNMERGE_VALUES: 2843 return selectG_UNMERGE_VALUES(I); 2844 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2845 return selectG_BUILD_VECTOR_TRUNC(I); 2846 case TargetOpcode::G_PTR_ADD: 2847 return selectG_PTR_ADD(I); 2848 case TargetOpcode::G_IMPLICIT_DEF: 2849 return selectG_IMPLICIT_DEF(I); 2850 case TargetOpcode::G_FREEZE: 2851 return selectCOPY(I); 2852 case TargetOpcode::G_INSERT: 2853 return selectG_INSERT(I); 2854 case TargetOpcode::G_INTRINSIC: 2855 return selectG_INTRINSIC(I); 2856 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 2857 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 2858 case TargetOpcode::G_ICMP: 2859 if (selectG_ICMP(I)) 2860 return true; 2861 return selectImpl(I, *CoverageInfo); 2862 case TargetOpcode::G_LOAD: 2863 case TargetOpcode::G_STORE: 2864 case TargetOpcode::G_ATOMIC_CMPXCHG: 2865 case 
TargetOpcode::G_ATOMICRMW_XCHG: 2866 case TargetOpcode::G_ATOMICRMW_ADD: 2867 case TargetOpcode::G_ATOMICRMW_SUB: 2868 case TargetOpcode::G_ATOMICRMW_AND: 2869 case TargetOpcode::G_ATOMICRMW_OR: 2870 case TargetOpcode::G_ATOMICRMW_XOR: 2871 case TargetOpcode::G_ATOMICRMW_MIN: 2872 case TargetOpcode::G_ATOMICRMW_MAX: 2873 case TargetOpcode::G_ATOMICRMW_UMIN: 2874 case TargetOpcode::G_ATOMICRMW_UMAX: 2875 case TargetOpcode::G_ATOMICRMW_FADD: 2876 case AMDGPU::G_AMDGPU_ATOMIC_INC: 2877 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 2878 case AMDGPU::G_AMDGPU_ATOMIC_FMIN: 2879 case AMDGPU::G_AMDGPU_ATOMIC_FMAX: 2880 return selectG_LOAD_STORE_ATOMICRMW(I); 2881 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 2882 return selectG_AMDGPU_ATOMIC_CMPXCHG(I); 2883 case TargetOpcode::G_SELECT: 2884 return selectG_SELECT(I); 2885 case TargetOpcode::G_TRUNC: 2886 return selectG_TRUNC(I); 2887 case TargetOpcode::G_SEXT: 2888 case TargetOpcode::G_ZEXT: 2889 case TargetOpcode::G_ANYEXT: 2890 case TargetOpcode::G_SEXT_INREG: 2891 if (selectImpl(I, *CoverageInfo)) 2892 return true; 2893 return selectG_SZA_EXT(I); 2894 case TargetOpcode::G_BRCOND: 2895 return selectG_BRCOND(I); 2896 case TargetOpcode::G_GLOBAL_VALUE: 2897 return selectG_GLOBAL_VALUE(I); 2898 case TargetOpcode::G_PTRMASK: 2899 return selectG_PTRMASK(I); 2900 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2901 return selectG_EXTRACT_VECTOR_ELT(I); 2902 case TargetOpcode::G_INSERT_VECTOR_ELT: 2903 return selectG_INSERT_VECTOR_ELT(I); 2904 case TargetOpcode::G_SHUFFLE_VECTOR: 2905 return selectG_SHUFFLE_VECTOR(I); 2906 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 2907 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 2908 const AMDGPU::ImageDimIntrinsicInfo *Intr 2909 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); 2910 assert(Intr && "not an image intrinsic with image pseudo"); 2911 return selectImageIntrinsic(I, Intr); 2912 } 2913 default: 2914 return selectImpl(I, *CoverageInfo); 2915 } 2916 return false; 2917 } 2918 2919 InstructionSelector::ComplexRendererFns 2920 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 2921 return {{ 2922 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2923 }}; 2924 2925 } 2926 2927 std::pair<Register, unsigned> 2928 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const { 2929 Register Src = Root.getReg(); 2930 Register OrigSrc = Src; 2931 unsigned Mods = 0; 2932 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 2933 2934 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 2935 Src = MI->getOperand(1).getReg(); 2936 Mods |= SISrcMods::NEG; 2937 MI = getDefIgnoringCopies(Src, *MRI); 2938 } 2939 2940 if (MI && MI->getOpcode() == AMDGPU::G_FABS) { 2941 Src = MI->getOperand(1).getReg(); 2942 Mods |= SISrcMods::ABS; 2943 } 2944 2945 if (Mods != 0 && 2946 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 2947 MachineInstr *UseMI = Root.getParent(); 2948 2949 // If we looked through copies to find source modifiers on an SGPR operand, 2950 // we now have an SGPR register source. To avoid potentially violating the 2951 // constant bus restriction, we need to insert a copy to a VGPR. 2952 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); 2953 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 2954 TII.get(AMDGPU::COPY), VGPRSrc) 2955 .addReg(Src); 2956 Src = VGPRSrc; 2957 } 2958 2959 return std::make_pair(Src, Mods); 2960 } 2961 2962 /// 2963 /// This will select either an SGPR or VGPR operand and will save us from 2964 /// having to write an extra tablegen pattern. 
2965 InstructionSelector::ComplexRendererFns 2966 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 2967 return {{ 2968 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2969 }}; 2970 } 2971 2972 InstructionSelector::ComplexRendererFns 2973 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 2974 Register Src; 2975 unsigned Mods; 2976 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2977 2978 return {{ 2979 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2980 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 2981 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2982 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2983 }}; 2984 } 2985 2986 InstructionSelector::ComplexRendererFns 2987 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 2988 return {{ 2989 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2990 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2991 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2992 }}; 2993 } 2994 2995 InstructionSelector::ComplexRendererFns 2996 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 2997 Register Src; 2998 unsigned Mods; 2999 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3000 3001 return {{ 3002 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3003 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3004 }}; 3005 } 3006 3007 InstructionSelector::ComplexRendererFns 3008 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 3009 Register Reg = Root.getReg(); 3010 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 3011 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 3012 Def->getOpcode() == AMDGPU::G_FABS)) 3013 return {}; 3014 return {{ 3015 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3016 }}; 3017 } 3018 3019 std::pair<Register, unsigned> 3020 AMDGPUInstructionSelector::selectVOP3PModsImpl( 3021 Register Src, const MachineRegisterInfo &MRI) const { 3022 unsigned Mods = 0; 3023 MachineInstr *MI = MRI.getVRegDef(Src); 3024 3025 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 3026 // It's possible to see an f32 fneg here, but unlikely. 3027 // TODO: Treat f32 fneg as only high bit. 3028 MRI.getType(Src) == LLT::vector(2, 16)) { 3029 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 3030 Src = MI->getOperand(1).getReg(); 3031 MI = MRI.getVRegDef(Src); 3032 } 3033 3034 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 3035 3036 // Packed instructions do not have abs modifiers. 
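// OP_SEL_1 (op_sel_hi) is set by default for packed operands, so the high
// half of the source feeds the high half of the operation.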
3037 Mods |= SISrcMods::OP_SEL_1; 3038 3039 return std::make_pair(Src, Mods); 3040 } 3041 3042 InstructionSelector::ComplexRendererFns 3043 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 3044 MachineRegisterInfo &MRI 3045 = Root.getParent()->getParent()->getParent()->getRegInfo(); 3046 3047 Register Src; 3048 unsigned Mods; 3049 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 3050 3051 return {{ 3052 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3053 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3054 }}; 3055 } 3056 3057 InstructionSelector::ComplexRendererFns 3058 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 3059 Register Src; 3060 unsigned Mods; 3061 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3062 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) 3063 return None; 3064 3065 return {{ 3066 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3067 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3068 }}; 3069 } 3070 3071 InstructionSelector::ComplexRendererFns 3072 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 3073 // FIXME: Handle op_sel 3074 return {{ 3075 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3076 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 3077 }}; 3078 } 3079 3080 InstructionSelector::ComplexRendererFns 3081 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 3082 SmallVector<GEPInfo, 4> AddrInfo; 3083 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3084 3085 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3086 return None; 3087 3088 const GEPInfo &GEPInfo = AddrInfo[0]; 3089 Optional<int64_t> EncodedImm = 3090 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false); 3091 if (!EncodedImm) 3092 return None; 3093 3094 unsigned PtrReg = GEPInfo.SgprParts[0]; 3095 return {{ 3096 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3097 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3098 }}; 3099 } 3100 3101 InstructionSelector::ComplexRendererFns 3102 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 3103 SmallVector<GEPInfo, 4> AddrInfo; 3104 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3105 3106 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3107 return None; 3108 3109 const GEPInfo &GEPInfo = AddrInfo[0]; 3110 Register PtrReg = GEPInfo.SgprParts[0]; 3111 Optional<int64_t> EncodedImm = 3112 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 3113 if (!EncodedImm) 3114 return None; 3115 3116 return {{ 3117 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3118 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3119 }}; 3120 } 3121 3122 InstructionSelector::ComplexRendererFns 3123 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 3124 MachineInstr *MI = Root.getParent(); 3125 MachineBasicBlock *MBB = MI->getParent(); 3126 3127 SmallVector<GEPInfo, 4> AddrInfo; 3128 getAddrModeInfo(*MI, *MRI, AddrInfo); 3129 3130 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, 3131 // then we can select all ptr + 32-bit offsets not just immediate offsets. 3132 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3133 return None; 3134 3135 const GEPInfo &GEPInfo = AddrInfo[0]; 3136 // SGPR offset is unsigned. 
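// Reject a zero offset (no offset operand is needed then), a negative
// offset, or one that does not fit in 32 bits.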
3137 if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) 3138 return None; 3139 3140 // If we make it this far we have a load with an 32-bit immediate offset. 3141 // It is OK to select this using a sgpr offset, because we have already 3142 // failed trying to select this load into one of the _IMM variants since 3143 // the _IMM Patterns are considered before the _SGPR patterns. 3144 Register PtrReg = GEPInfo.SgprParts[0]; 3145 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3146 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) 3147 .addImm(GEPInfo.Imm); 3148 return {{ 3149 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3150 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } 3151 }}; 3152 } 3153 3154 template <bool Signed> 3155 InstructionSelector::ComplexRendererFns 3156 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { 3157 MachineInstr *MI = Root.getParent(); 3158 3159 InstructionSelector::ComplexRendererFns Default = {{ 3160 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3161 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset 3162 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 3163 }}; 3164 3165 if (!STI.hasFlatInstOffsets()) 3166 return Default; 3167 3168 const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); 3169 if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD) 3170 return Default; 3171 3172 Optional<int64_t> Offset = 3173 getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); 3174 if (!Offset.hasValue()) 3175 return Default; 3176 3177 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 3178 if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed)) 3179 return Default; 3180 3181 Register BasePtr = OpDef->getOperand(1).getReg(); 3182 3183 return {{ 3184 [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, 3185 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, 3186 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 3187 }}; 3188 } 3189 3190 InstructionSelector::ComplexRendererFns 3191 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 3192 return selectFlatOffsetImpl<false>(Root); 3193 } 3194 3195 InstructionSelector::ComplexRendererFns 3196 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { 3197 return selectFlatOffsetImpl<true>(Root); 3198 } 3199 3200 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 3201 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 3202 return PSV && PSV->isStack(); 3203 } 3204 3205 InstructionSelector::ComplexRendererFns 3206 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 3207 MachineInstr *MI = Root.getParent(); 3208 MachineBasicBlock *MBB = MI->getParent(); 3209 MachineFunction *MF = MBB->getParent(); 3210 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 3211 3212 int64_t Offset = 0; 3213 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) && 3214 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) { 3215 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3216 3217 // TODO: Should this be inside the render function? The iterator seems to 3218 // move. 
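// The constant is split into a 4 KiB aligned part materialized into a VGPR
// and used as vaddr (Offset & ~4095), and the low 12 bits (Offset & 4095),
// which fit in the MUBUF immediate offset field.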
3219 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 3220 HighBits) 3221 .addImm(Offset & ~4095); 3222 3223 return {{[=](MachineInstrBuilder &MIB) { // rsrc 3224 MIB.addReg(Info->getScratchRSrcReg()); 3225 }, 3226 [=](MachineInstrBuilder &MIB) { // vaddr 3227 MIB.addReg(HighBits); 3228 }, 3229 [=](MachineInstrBuilder &MIB) { // soffset 3230 const MachineMemOperand *MMO = *MI->memoperands_begin(); 3231 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 3232 3233 if (isStackPtrRelative(PtrInfo)) 3234 MIB.addReg(Info->getStackPtrOffsetReg()); 3235 else 3236 MIB.addImm(0); 3237 }, 3238 [=](MachineInstrBuilder &MIB) { // offset 3239 MIB.addImm(Offset & 4095); 3240 }}}; 3241 } 3242 3243 assert(Offset == 0 || Offset == -1); 3244 3245 // Try to fold a frame index directly into the MUBUF vaddr field, and any 3246 // offsets. 3247 Optional<int> FI; 3248 Register VAddr = Root.getReg(); 3249 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) { 3250 if (isBaseWithConstantOffset(Root, *MRI)) { 3251 const MachineOperand &LHS = RootDef->getOperand(1); 3252 const MachineOperand &RHS = RootDef->getOperand(2); 3253 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); 3254 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); 3255 if (LHSDef && RHSDef) { 3256 int64_t PossibleOffset = 3257 RHSDef->getOperand(1).getCImm()->getSExtValue(); 3258 if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && 3259 (!STI.privateMemoryResourceIsRangeChecked() || 3260 KnownBits->signBitIsZero(LHS.getReg()))) { 3261 if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) 3262 FI = LHSDef->getOperand(1).getIndex(); 3263 else 3264 VAddr = LHS.getReg(); 3265 Offset = PossibleOffset; 3266 } 3267 } 3268 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { 3269 FI = RootDef->getOperand(1).getIndex(); 3270 } 3271 } 3272 3273 return {{[=](MachineInstrBuilder &MIB) { // rsrc 3274 MIB.addReg(Info->getScratchRSrcReg()); 3275 }, 3276 [=](MachineInstrBuilder &MIB) { // vaddr 3277 if (FI.hasValue()) 3278 MIB.addFrameIndex(FI.getValue()); 3279 else 3280 MIB.addReg(VAddr); 3281 }, 3282 [=](MachineInstrBuilder &MIB) { // soffset 3283 // If we don't know this private access is a local stack object, it 3284 // needs to be relative to the entry point's scratch wave offset. 3285 // TODO: Should split large offsets that don't fit like above. 3286 // TODO: Don't use scratch wave offset just because the offset 3287 // didn't fit. 3288 if (!Info->isEntryFunction() && FI.hasValue()) 3289 MIB.addReg(Info->getStackPtrOffsetReg()); 3290 else 3291 MIB.addImm(0); 3292 }, 3293 [=](MachineInstrBuilder &MIB) { // offset 3294 MIB.addImm(Offset); 3295 }}}; 3296 } 3297 3298 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, 3299 int64_t Offset, 3300 unsigned OffsetBits) const { 3301 if ((OffsetBits == 16 && !isUInt<16>(Offset)) || 3302 (OffsetBits == 8 && !isUInt<8>(Offset))) 3303 return false; 3304 3305 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 3306 return true; 3307 3308 // On Southern Islands instruction with a negative base value and an offset 3309 // don't seem to work. 
  return KnownBits->signBitIsZero(Base);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  return {{
    [=](MachineInstrBuilder &MIB) { // rsrc
      MIB.addReg(Info->getScratchRSrcReg());
    },
    [=](MachineInstrBuilder &MIB) { // soffset
      if (isStackPtrRelative(PtrInfo))
        MIB.addReg(Info->getStackPtrOffsetReg());
      else
        MIB.addImm(0);
    },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    if (isDSOffsetLegal(PtrBase, Offset, 16)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, Offset);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO


  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset + 1); }
    }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    int64_t DWordOffset0 = Offset / 4;
    int64_t DWordOffset1 = DWordOffset0 + 1;
    if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, DWordOffset0);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this
/// does not match the pattern.
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
    Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = MRI.getVRegDef(Root);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0};

  MachineOperand &RHS = RootI->getOperand(2);
  Optional<ValueAndVReg> MaybeOffset
    = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
  if (!MaybeOffset)
    return {Root, 0};
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
}

static void addZeroImm(MachineInstrBuilder &MIB) {
  MIB.addImm(0);
}

/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If
/// \p BasePtr is not valid, a null base pointer will be used.
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}

static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}

static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
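  // Illustrative note (an approximation of the buffer resource / V# layout):
  // dwords 0-1 hold the base address, dword 2 is NUM_RECORDS, and dword 3
  // holds the format fields. Passing -1 as FormatLo here sets NUM_RECORDS to
  // 0xffffffff so offset-mode accesses are not clipped by range checking,
  // whereas the addr64 variant above leaves dword 2 as 0.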
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: Don't know this was defined by operand 0
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return true if the addr64 MUBUF mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
    MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(SOffset)
    .addImm(ImmOffset);
  ImmOffset = 0;
}

bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
    MachineOperand &Root, Register &VAddr, Register &RSrcReg,
    Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // The addr64 bit was removed for Volcanic Islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
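      // Put the uniform (SGPR) part of the address in the resource
      // descriptor's 64-bit base, and the other part in the addr64 vaddr
      // field. When both parts were divergent, the already-computed N0 was
      // used above with a null descriptor base instead.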
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource.
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
    MachineOperand &Root, Register &RSrcReg, Register &SOffset,
    int64_t &Offset) const {
  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Register SRDPtr = AddrData.N0;
  Offset = AddrData.Offset;

  // TODO: Look through extensions for 32-bit soffset.
  MachineIRBuilder B(*Root.getParent());

  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
    [=](MachineInstrBuilder &MIB) { // rsrc
      MIB.addReg(RSrcReg);
    },
    [=](MachineInstrBuilder &MIB) { // vaddr
      MIB.addReg(VAddr);
    },
    [=](MachineInstrBuilder &MIB) { // soffset
      if (SOffset)
        MIB.addReg(SOffset);
      else
        MIB.addImm(0);
    },
    [=](MachineInstrBuilder &MIB) { // offset
      MIB.addImm(Offset);
    },
    addZeroImm, // glc
    addZeroImm, // slc
    addZeroImm, // tfe
    addZeroImm, // dlc
    addZeroImm  // swz
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
    [=](MachineInstrBuilder &MIB) { // rsrc
      MIB.addReg(RSrcReg);
    },
    [=](MachineInstrBuilder &MIB) { // soffset
      if (SOffset)
        MIB.addReg(SOffset);
      else
        MIB.addImm(0);
    },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
    addZeroImm, // glc
    addZeroImm, // slc
    addZeroImm, // tfe
    addZeroImm, // dlc
    addZeroImm  // swz
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
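  // Note: unlike the non-atomic patterns above, the atomic address modes only
  // render an slc operand. The glc bit on MUBUF atomics controls whether the
  // pre-op value is returned and is presumed to be handled by selecting the
  // _RTN instruction variant rather than by this complex pattern.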
  return {{
    [=](MachineInstrBuilder &MIB) { // rsrc
      MIB.addReg(RSrcReg);
    },
    [=](MachineInstrBuilder &MIB) { // vaddr
      MIB.addReg(VAddr);
    },
    [=](MachineInstrBuilder &MIB) { // soffset
      if (SOffset)
        MIB.addReg(SOffset);
      else
        MIB.addImm(0);
    },
    [=](MachineInstrBuilder &MIB) { // offset
      MIB.addImm(Offset);
    },
    addZeroImm // slc
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
    [=](MachineInstrBuilder &MIB) { // rsrc
      MIB.addReg(RSrcReg);
    },
    [=](MachineInstrBuilder &MIB) { // soffset
      if (SOffset)
        MIB.addReg(SOffset);
      else
        MIB.addImm(0);
    },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
    addZeroImm // slc
  }};
}

/// Get an immediate that must be 32 bits, and treated as zero-extended.
static Optional<uint64_t> getConstantZext32Val(Register Reg,
                                               const MachineRegisterInfo &MRI) {
  // getConstantVRegVal sexts any values, so see if that matters.
  Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return None;
  return Lo_32(*OffsetVal);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm
    = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx == -1);

  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
G_CONSTANT"); 3828 MIB.addImm(Op.getCImm()->getSExtValue()); 3829 } 3830 } 3831 3832 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, 3833 const MachineInstr &MI, 3834 int OpIdx) const { 3835 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 3836 "Expected G_CONSTANT"); 3837 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); 3838 } 3839 3840 /// This only really exists to satisfy DAG type checking machinery, so is a 3841 /// no-op here. 3842 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, 3843 const MachineInstr &MI, 3844 int OpIdx) const { 3845 MIB.addImm(MI.getOperand(OpIdx).getImm()); 3846 } 3847 3848 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB, 3849 const MachineInstr &MI, 3850 int OpIdx) const { 3851 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3852 MIB.addImm(MI.getOperand(OpIdx).getImm() & 1); 3853 } 3854 3855 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB, 3856 const MachineInstr &MI, 3857 int OpIdx) const { 3858 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3859 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1); 3860 } 3861 3862 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB, 3863 const MachineInstr &MI, 3864 int OpIdx) const { 3865 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3866 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1); 3867 } 3868 3869 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, 3870 const MachineInstr &MI, 3871 int OpIdx) const { 3872 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3873 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); 3874 } 3875 3876 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, 3877 const MachineInstr &MI, 3878 int OpIdx) const { 3879 MIB.addFrameIndex((MI.getOperand(1).getIndex())); 3880 } 3881 3882 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { 3883 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); 3884 } 3885 3886 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const { 3887 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm()); 3888 } 3889 3890 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const { 3891 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm()); 3892 } 3893 3894 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { 3895 return TII.isInlineConstant(Imm); 3896 } 3897