//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
                                        CodeGenCoverage &CoverageInfo) {
  MRI = &MF.getRegInfo();
  InstructionSelector::setupMF(MF, KB, CoverageInfo);
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  if (Register::isPhysicalRegister(Reg))
    return Reg == TRI.getVCC();

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}
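// Lower a copy-like intrinsic (wqm, softwqm, wwm) into its pseudo opcode
// NewOpc: drop the intrinsic ID operand, add an implicit EXEC use, and
// constrain the source and destination to a common register class.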
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to specifically
    // know that the result is VCCRegBank, and not just an SGPR with size 1.
    // An SReg_32 with size 1 is ambiguous with wave32.
    if (Src.isUndef()) {
      const TargetRegisterClass *SrcRC =
        TRI.getConstrainedRegClassForOperand(Src, *MRI);
      if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (Register::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}
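// Select G_AND/G_OR/G_XOR. Results on the VCC bank are lane masks and use the
// wave-width SALU opcodes; plain SGPR results pick the 32- or 64-bit form from
// the destination size.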
bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  MachineOperand &Dst = I.getOperand(0);
  MachineOperand &Src0 = I.getOperand(1);
  MachineOperand &Src1 = I.getOperand(2);
  Register DstReg = Dst.getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
    const TargetRegisterClass *RC = TRI.getBoolRC();
    unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
                                           RC == &AMDGPU::SReg_64RegClass);
    I.setDesc(TII.get(InstOpc));
    // Dead implicit-def of scc
    I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                           true, // isImp
                                           false, // isKill
                                           true)); // isDead

    // FIXME: Hack to avoid turning the register bank into a register class.
    // The selector for G_ICMP relies on seeing the register bank for the result
    // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will
    // be ambiguous whether it's a scalar or vector bool.
    if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg()))
      MRI->setRegClass(Src0.getReg(), RC);
    if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg()))
      MRI->setRegClass(Src1.getReg(), RC);

    return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
  }

  // TODO: Should this allow an SCC bank result, and produce a copy from SCC for
  // the result?
  if (DstRB->getID() == AMDGPU::SGPRRegBankID) {
    unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32);
    I.setDesc(TII.get(InstOpc));
    // Dead implicit-def of scc
    I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                           true, // isImp
                                           false, // isKill
                                           true)); // isDead
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

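    // No add/sub-no-carry instruction is available, so fall back to the
    // carry-out form and mark the unused carry def as dead.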
    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
      IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

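  // Scalar case: S_ADD/S_SUB use SCC for the carry, so the carry-in (if any)
  // was copied into SCC above and the carry-out is copied back out of SCC
  // below.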
  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
    .addReg(AMDGPU::SCC);

  if (!MRI->getRegClassOrNull(Dst1Reg))
    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

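  // Assemble the pieces with a REG_SEQUENCE, assigning each source to the
  // subregister index covering its slice of the destination.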
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  const unsigned SrcFlags = getUndefRegState(Src.isUndef());

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, SrcFlags, SubRegs[I]);

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
  MachineInstr &MI) const {
  if (selectImpl(MI, *CoverageInfo))
    return true;

  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != V2S16)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI->getType(Src0) != S32)
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;

      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
        .addImm(Lo16 | (Hi16 << 16));
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.RemoveOperand(2);
    return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
           RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
  }

  Register ShiftSrc0;
  Register ShiftSrc1;
  int64_t ShiftAmt;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  // FIXME: This is an inconvenient way to check a specific value
  bool Shift0 = mi_match(
    Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  bool Shift1 = mi_match(
    Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
    // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
    auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
      .addReg(ShiftSrc0)
      .addImm(16);

    MI.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
  const TargetRegisterClass *Src1RC =
    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
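// Dst1 receives the instruction's second result; src0 is taken from the
// numerator or the denominator, as selected by the immediate 'choose
// denominator' operand.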
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64;
  else
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addUse(Src0)
    .addUse(Denom)
    .addUse(Numer);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::WWM);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
    return selectIntrinsicIcmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}
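// Return the scalar compare opcode for the predicate, or -1 when no scalar
// form exists (only 32-bit compares, plus 64-bit eq/ne when the subtarget has
// scalar 64-bit compares).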
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
            I.getOperand(0).getReg())
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
                               *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;

  if (Size != STI.getWavefrontSize())
    return false;

  Optional<ValueAndVReg> Arg =
      getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);

  if (Arg.hasValue()) {
    const int64_t Value = Arg.getValue().Value;
    if (Value == 0) {
      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else if (Value == -1) { // all ones
      Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
    } else
      return false;
  } else {
    Register SrcReg = I.getOperand(2).getReg();
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
  }

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto RelocSymbol = cast<GlobalVariable>(
    M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  MachineBasicBlock *BB = I.getParent();
  BuildMI(*BB, &I, I.getDebugLoc(),
          TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
    .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
  MachineBasicBlock *BB = MI.getParent();
  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
    .add(MI.getOperand(1));

  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
  return true;
}

static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
  switch (MF.getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    return 1;
  case CallingConv::AMDGPU_VS:
    return 2;
  case CallingConv::AMDGPU_GS:
    return 3;
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_ES:
    report_fatal_error("ds_ordered_count unsupported for this calling conv");
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::C:
  case CallingConv::Fast:
  default:
    // Assume other calling conventions are various compute callable functions
    return 0;
  }
}

bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
  MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease)
    report_fatal_error("ds_ordered_count: wave_done requires wave_release");

  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      report_fatal_error(
        "ds_ordered_count: dword count must be between 1 and 4");
    }
  }

  if (IndexOperand)
    report_fatal_error("ds_ordered_count: bad index operand");

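  // Pack the DS_ORDERED_COUNT offset field. Going by the shifts below, the
  // layout is: bits [7:2] = ordered-count index, bit [8] = wave_release,
  // bit [9] = wave_done, bits [11:10] = shader type, bit [12] = instruction
  // (0 = add, 1 = swap), and on GFX10 bits [15:14] = dword count - 1.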
  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType = getDSShaderTypeValue(*MF);

  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                     (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !STI.hasGWSSemaReleaseAll())
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  assert(OffsetDef);

  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset, OffsetDef)
      = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16);

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);
    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .addImm(-1) // $gds
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
    .addImm(Offset)
    .addImm(IsGDS ? -1 : 0)
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x2;

  return TexFailCtrl == 0;
}

static bool parseCachePolicy(uint64_t Value,
                             bool *GLC, bool *SLC, bool *DLC) {
  if (GLC) {
    *GLC = (Value & 0x1) ? 1 : 0;
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = (Value & 0x2) ? 1 : 0;
    Value &= ~(uint64_t)0x2;
  }
  if (DLC) {
    *DLC = (Value & 0x4) ? 1 : 0;
    Value &= ~(uint64_t)0x4;
  }

  return Value == 0;
}
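// Manually select an image (MIMG) intrinsic: work out the vdata/vaddr register
// layout from the dmask and address operands, fold the _L -> _LZ and
// _MIP -> non-MIP variants when the LOD is a constant zero, and pick the
// encoding (GFX10 NSA vs. packed addresses, GFX8/GFX6) matching the operands.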
bool AMDGPUInstructionSelector::selectImageIntrinsic(
  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
      AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
      AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;

  const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
                                             MI.getNumExplicitDefs());
  int NumVAddr, NumGradients;
  std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = false;

  // XXX - Can we just get the second to last argument for ctrl?
  unsigned CtrlIdx; // Index of texfailctrl argument
  bool Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = true;
    CtrlIdx = VAddrIdx + NumVAddr + 1;
  } else {
    Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
    CtrlIdx = VAddrIdx + NumVAddr + 3;
  }

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients
  if (IsA16 && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    const int DMaskIdx = 2; // Input/output + intrinsic ID.

    DMask = MI.getOperand(DMaskIdx).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      // One memoperand is mandatory, except for getresinfo.
      // FIXME: Check this in verifier.
      if (!MI.memoperands_empty()) {
        const MachineMemOperand *MMO = *MI.memoperands_begin();

        // Infer d16 from the memory size, as the register type will be
        // mangled by unpacked subtargets, or by TFE.
        IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;

        if (IsD16 && !STI.hasUnpackedD16VMem())
          NumVDataDwords = (DMaskLanes + 1) / 2;
      }
    }
  }

  // Optimize _L to _LZ when _L is zero
  if (LZMappingInfo) {
    // The legalizer replaced the register with an immediate 0 if we need to
    // change the opcode.
    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (MIPMappingInfo) {
    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
    }
  }

  // Set G16 opcode
  if (IsG16 && !IsA16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  bool GLC = false;
  bool SLC = false;
  bool DLC = false;
  if (BaseOpcode->Atomic) {
    GLC = true; // TODO no-return optimization
    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return false;
  } else {
    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return false;
  }

  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (int I = 0; I < NumVAddr; ++I) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register
  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  if (IsTexFail)
    ++NumVDataDwords;

  int Opcode = -1;
  if (IsGFX10) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;

      MIB.addDef(TmpReg);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
        .addReg(TmpReg, RegState::Kill, SubReg);

    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  for (int i = 0; i != NumVAddrRegs; ++i) {
    MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler

  MIB.addImm(DMask); // dmask

  if (IsGFX10)
    MIB.addImm(DimInfo->Encoding);
  MIB.addImm(Unorm);
  if (IsGFX10)
    MIB.addImm(DLC);

  MIB.addImm(GLC);
  MIB.addImm(SLC);
  MIB.addImm(IsA16 &&  // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10)
    MIB.addImm(IsA16 ? -1 : 0);

  MIB.addImm(TFE); // tfe
  MIB.addImm(LWE); // lwe
  if (!IsGFX10)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
  MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  default: {
    return selectImpl(I, *CoverageInfo);
  }
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
                            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it. Set the register class manually here.
1685 if (!MRI->getRegClassOrNull(CCReg)) 1686 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1687 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1688 .add(I.getOperand(2)) 1689 .add(I.getOperand(3)); 1690 1691 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1692 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1693 I.eraseFromParent(); 1694 return Ret; 1695 } 1696 1697 // Wide VGPR select should have been split in RegBankSelect. 1698 if (Size > 32) 1699 return false; 1700 1701 MachineInstr *Select = 1702 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1703 .addImm(0) 1704 .add(I.getOperand(3)) 1705 .addImm(0) 1706 .add(I.getOperand(2)) 1707 .add(I.getOperand(1)); 1708 1709 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1710 I.eraseFromParent(); 1711 return Ret; 1712 } 1713 1714 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1715 initM0(I); 1716 return selectImpl(I, *CoverageInfo); 1717 } 1718 1719 static int sizeToSubRegIndex(unsigned Size) { 1720 switch (Size) { 1721 case 32: 1722 return AMDGPU::sub0; 1723 case 64: 1724 return AMDGPU::sub0_sub1; 1725 case 96: 1726 return AMDGPU::sub0_sub1_sub2; 1727 case 128: 1728 return AMDGPU::sub0_sub1_sub2_sub3; 1729 case 256: 1730 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1731 default: 1732 if (Size < 32) 1733 return AMDGPU::sub0; 1734 if (Size > 256) 1735 return -1; 1736 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1737 } 1738 } 1739 1740 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1741 Register DstReg = I.getOperand(0).getReg(); 1742 Register SrcReg = I.getOperand(1).getReg(); 1743 const LLT DstTy = MRI->getType(DstReg); 1744 const LLT SrcTy = MRI->getType(SrcReg); 1745 const LLT S1 = LLT::scalar(1); 1746 1747 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1748 const RegisterBank *DstRB; 1749 if (DstTy == S1) { 1750 // This is a special case. We don't treat s1 for legalization artifacts as 1751 // vcc booleans. 1752 DstRB = SrcRB; 1753 } else { 1754 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1755 if (SrcRB != DstRB) 1756 return false; 1757 } 1758 1759 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 1760 1761 unsigned DstSize = DstTy.getSizeInBits(); 1762 unsigned SrcSize = SrcTy.getSizeInBits(); 1763 1764 const TargetRegisterClass *SrcRC 1765 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1766 const TargetRegisterClass *DstRC 1767 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1768 if (!SrcRC || !DstRC) 1769 return false; 1770 1771 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1772 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1773 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1774 return false; 1775 } 1776 1777 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { 1778 MachineBasicBlock *MBB = I.getParent(); 1779 const DebugLoc &DL = I.getDebugLoc(); 1780 1781 Register LoReg = MRI->createVirtualRegister(DstRC); 1782 Register HiReg = MRI->createVirtualRegister(DstRC); 1783 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 1784 .addReg(SrcReg, 0, AMDGPU::sub0); 1785 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 1786 .addReg(SrcReg, 0, AMDGPU::sub1); 1787 1788 if (IsVALU && STI.hasSDWA()) { 1789 // Write the low 16-bits of the high element into the high 16-bits of the 1790 // low element. 
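      // Illustrative example (not from the original source): with
      // LoReg = 0x0000AAAA and HiReg = 0xXXXXBBBB, the SDWA mov below
      // produces DstReg = 0xBBBBAAAA; the tied LoReg operand supplies the
      // preserved low 16 bits.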
1791 MachineInstr *MovSDWA = 1792 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1793 .addImm(0) // $src0_modifiers 1794 .addReg(HiReg) // $src0 1795 .addImm(0) // $clamp 1796 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1797 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1798 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1799 .addReg(LoReg, RegState::Implicit); 1800 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1801 } else { 1802 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1803 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1804 Register ImmReg = MRI->createVirtualRegister(DstRC); 1805 if (IsVALU) { 1806 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1807 .addImm(16) 1808 .addReg(HiReg); 1809 } else { 1810 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1811 .addReg(HiReg) 1812 .addImm(16); 1813 } 1814 1815 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1816 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1817 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1818 1819 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1820 .addImm(0xffff); 1821 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1822 .addReg(LoReg) 1823 .addReg(ImmReg); 1824 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1825 .addReg(TmpReg0) 1826 .addReg(TmpReg1); 1827 } 1828 1829 I.eraseFromParent(); 1830 return true; 1831 } 1832 1833 if (!DstTy.isScalar()) 1834 return false; 1835 1836 if (SrcSize > 32) { 1837 int SubRegIdx = sizeToSubRegIndex(DstSize); 1838 if (SubRegIdx == -1) 1839 return false; 1840 1841 // Deal with weird cases where the class only partially supports the subreg 1842 // index. 1843 const TargetRegisterClass *SrcWithSubRC 1844 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1845 if (!SrcWithSubRC) 1846 return false; 1847 1848 if (SrcWithSubRC != SrcRC) { 1849 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1850 return false; 1851 } 1852 1853 I.getOperand(1).setSubReg(SubRegIdx); 1854 } 1855 1856 I.setDesc(TII.get(TargetOpcode::COPY)); 1857 return true; 1858 } 1859 1860 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1861 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1862 Mask = maskTrailingOnes<unsigned>(Size); 1863 int SignedMask = static_cast<int>(Mask); 1864 return SignedMask >= -16 && SignedMask <= 64; 1865 } 1866 1867 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1868 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1869 Register Reg, const MachineRegisterInfo &MRI, 1870 const TargetRegisterInfo &TRI) const { 1871 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1872 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1873 return RB; 1874 1875 // Ignore the type, since we don't use vcc in artifacts. 
1876 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1877 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1878 return nullptr; 1879 } 1880 1881 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1882 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1883 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1884 const DebugLoc &DL = I.getDebugLoc(); 1885 MachineBasicBlock &MBB = *I.getParent(); 1886 const Register DstReg = I.getOperand(0).getReg(); 1887 const Register SrcReg = I.getOperand(1).getReg(); 1888 1889 const LLT DstTy = MRI->getType(DstReg); 1890 const LLT SrcTy = MRI->getType(SrcReg); 1891 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1892 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1893 const unsigned DstSize = DstTy.getSizeInBits(); 1894 if (!DstTy.isScalar()) 1895 return false; 1896 1897 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1898 return selectCOPY(I); 1899 1900 // Artifact casts should never use vcc. 1901 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1902 1903 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1904 // 64-bit should have been split up in RegBankSelect 1905 1906 // Try to use an and with a mask if it will save code size. 1907 unsigned Mask; 1908 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1909 MachineInstr *ExtI = 1910 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1911 .addImm(Mask) 1912 .addReg(SrcReg); 1913 I.eraseFromParent(); 1914 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1915 } 1916 1917 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1918 MachineInstr *ExtI = 1919 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1920 .addReg(SrcReg) 1921 .addImm(0) // Offset 1922 .addImm(SrcSize); // Width 1923 I.eraseFromParent(); 1924 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1925 } 1926 1927 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1928 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 1929 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 1930 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 1931 return false; 1932 1933 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1934 const unsigned SextOpc = SrcSize == 8 ? 1935 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1936 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1937 .addReg(SrcReg); 1938 I.eraseFromParent(); 1939 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1940 } 1941 1942 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1943 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1944 1945 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1946 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 1947 // We need a 64-bit register source, but the high bits don't matter. 1948 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1949 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1950 unsigned SubReg = InReg ? 
AMDGPU::sub0 : 0; 1951 1952 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1953 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1954 .addReg(SrcReg, 0, SubReg) 1955 .addImm(AMDGPU::sub0) 1956 .addReg(UndefReg) 1957 .addImm(AMDGPU::sub1); 1958 1959 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1960 .addReg(ExtReg) 1961 .addImm(SrcSize << 16); 1962 1963 I.eraseFromParent(); 1964 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1965 } 1966 1967 unsigned Mask; 1968 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1969 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1970 .addReg(SrcReg) 1971 .addImm(Mask); 1972 } else { 1973 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1974 .addReg(SrcReg) 1975 .addImm(SrcSize << 16); 1976 } 1977 1978 I.eraseFromParent(); 1979 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1980 } 1981 1982 return false; 1983 } 1984 1985 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1986 MachineBasicBlock *BB = I.getParent(); 1987 MachineOperand &ImmOp = I.getOperand(1); 1988 1989 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1990 if (ImmOp.isFPImm()) { 1991 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1992 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1993 } else if (ImmOp.isCImm()) { 1994 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); 1995 } 1996 1997 Register DstReg = I.getOperand(0).getReg(); 1998 unsigned Size; 1999 bool IsSgpr; 2000 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 2001 if (RB) { 2002 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 2003 Size = MRI->getType(DstReg).getSizeInBits(); 2004 } else { 2005 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 2006 IsSgpr = TRI.isSGPRClass(RC); 2007 Size = TRI.getRegSizeInBits(*RC); 2008 } 2009 2010 if (Size != 32 && Size != 64) 2011 return false; 2012 2013 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2014 if (Size == 32) { 2015 I.setDesc(TII.get(Opcode)); 2016 I.addImplicitDefUseOperands(*MF); 2017 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2018 } 2019 2020 const DebugLoc &DL = I.getDebugLoc(); 2021 2022 APInt Imm(Size, I.getOperand(1).getImm()); 2023 2024 MachineInstr *ResInst; 2025 if (IsSgpr && TII.isInlineConstant(Imm)) { 2026 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 2027 .addImm(I.getOperand(1).getImm()); 2028 } else { 2029 const TargetRegisterClass *RC = IsSgpr ? 
2030 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 2031 Register LoReg = MRI->createVirtualRegister(RC); 2032 Register HiReg = MRI->createVirtualRegister(RC); 2033 2034 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 2035 .addImm(Imm.trunc(32).getZExtValue()); 2036 2037 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 2038 .addImm(Imm.ashr(32).getZExtValue()); 2039 2040 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2041 .addReg(LoReg) 2042 .addImm(AMDGPU::sub0) 2043 .addReg(HiReg) 2044 .addImm(AMDGPU::sub1); 2045 } 2046 2047 // We can't call constrainSelectedInstRegOperands here, because it doesn't 2048 // work for target independent opcodes 2049 I.eraseFromParent(); 2050 const TargetRegisterClass *DstRC = 2051 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 2052 if (!DstRC) 2053 return true; 2054 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 2055 } 2056 2057 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2058 // Only manually handle the f64 SGPR case. 2059 // 2060 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2061 // the bit ops theoretically have a second result due to the implicit def of 2062 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2063 // that is easy by disabling the check. The result works, but uses a 2064 // nonsensical sreg32orlds_and_sreg_1 regclass. 2065 // 2066 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2067 // the variadic REG_SEQUENCE operands. 2068 2069 Register Dst = MI.getOperand(0).getReg(); 2070 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2071 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2072 MRI->getType(Dst) != LLT::scalar(64)) 2073 return false; 2074 2075 Register Src = MI.getOperand(1).getReg(); 2076 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 2077 if (Fabs) 2078 Src = Fabs->getOperand(1).getReg(); 2079 2080 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2081 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2082 return false; 2083 2084 MachineBasicBlock *BB = MI.getParent(); 2085 const DebugLoc &DL = MI.getDebugLoc(); 2086 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2087 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2088 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2089 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2090 2091 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2092 .addReg(Src, 0, AMDGPU::sub0); 2093 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2094 .addReg(Src, 0, AMDGPU::sub1); 2095 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2096 .addImm(0x80000000); 2097 2098 // Set or toggle sign bit. 2099 unsigned Opc = Fabs ? 
      AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}

// FIXME: This is a workaround for the same tablegen problems as G_FNEG
bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  Register Src = MI.getOperand(1).getReg();
  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(Src, 0, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(Src, 0, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
    .addImm(0x7fffffff);

  // Clear sign bit.
  // TODO: Should this use S_BITSET0_*?
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);

  MI.eraseFromParent();
  return true;
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
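      // Illustrative example (added): for %ptr = G_PTR_ADD %base, %c with
      // %c = G_CONSTANT i64 16, this branch records Imm = 16; the %base
      // operand is sorted into SgprParts or VgprParts by the non-constant
      // path of this loop.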
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}

bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
  }
}

bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}

// TODO: No rtn optimization.
bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
    MachineInstr &MI) const {
  Register PtrReg = MI.getOperand(1).getReg();
  const LLT PtrTy = MRI->getType(PtrReg);
  if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
      STI.useFlatForGlobal())
    return selectImpl(MI, *CoverageInfo);

  Register DstReg = MI.getOperand(0).getReg();
  const LLT Ty = MRI->getType(DstReg);
  const bool Is64 = Ty.getSizeInBits() == 64;
  const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  Register TmpReg = MRI->createVirtualRegister(
    Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  Register VAddr, RSrcReg, SOffset;
  int64_t Offset = 0;

  unsigned Opcode;
  if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
    Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
                    AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
  } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
                                   RSrcReg, SOffset, Offset)) {
    Opcode = Is64 ?
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : 2269 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; 2270 } else 2271 return selectImpl(MI, *CoverageInfo); 2272 2273 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) 2274 .addReg(MI.getOperand(2).getReg()); 2275 2276 if (VAddr) 2277 MIB.addReg(VAddr); 2278 2279 MIB.addReg(RSrcReg); 2280 if (SOffset) 2281 MIB.addReg(SOffset); 2282 else 2283 MIB.addImm(0); 2284 2285 MIB.addImm(Offset); 2286 MIB.addImm(0); // slc 2287 MIB.cloneMemRefs(MI); 2288 2289 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) 2290 .addReg(TmpReg, RegState::Kill, SubReg); 2291 2292 MI.eraseFromParent(); 2293 2294 MRI->setRegClass( 2295 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); 2296 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2297 } 2298 2299 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 2300 MachineBasicBlock *BB = I.getParent(); 2301 MachineOperand &CondOp = I.getOperand(0); 2302 Register CondReg = CondOp.getReg(); 2303 const DebugLoc &DL = I.getDebugLoc(); 2304 2305 unsigned BrOpcode; 2306 Register CondPhysReg; 2307 const TargetRegisterClass *ConstrainRC; 2308 2309 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2310 // whether the branch is uniform when selecting the instruction. In 2311 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 2312 // RegBankSelect knows what it's doing if the branch condition is scc, even 2313 // though it currently does not. 2314 if (!isVCC(CondReg, *MRI)) { 2315 if (MRI->getType(CondReg) != LLT::scalar(32)) 2316 return false; 2317 2318 CondPhysReg = AMDGPU::SCC; 2319 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 2320 // FIXME: Hack for isSCC tests 2321 ConstrainRC = &AMDGPU::SGPR_32RegClass; 2322 } else { 2323 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 2324 // We sort of know that a VCC producer based on the register bank, that ands 2325 // inactive lanes with 0. What if there was a logical operation with vcc 2326 // producers in different blocks/with different exec masks? 2327 // FIXME: Should scc->vcc copies and with exec? 2328 CondPhysReg = TRI.getVCC(); 2329 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 2330 ConstrainRC = TRI.getBoolRC(); 2331 } 2332 2333 if (!MRI->getRegClassOrNull(CondReg)) 2334 MRI->setRegClass(CondReg, ConstrainRC); 2335 2336 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 2337 .addReg(CondReg); 2338 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 2339 .addMBB(I.getOperand(1).getMBB()); 2340 2341 I.eraseFromParent(); 2342 return true; 2343 } 2344 2345 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE( 2346 MachineInstr &I) const { 2347 Register DstReg = I.getOperand(0).getReg(); 2348 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2349 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2350 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 2351 if (IsVGPR) 2352 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 2353 2354 return RBI.constrainGenericRegister( 2355 DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 2356 } 2357 2358 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 2359 Register DstReg = I.getOperand(0).getReg(); 2360 Register SrcReg = I.getOperand(1).getReg(); 2361 Register MaskReg = I.getOperand(2).getReg(); 2362 LLT Ty = MRI->getType(DstReg); 2363 LLT MaskTy = MRI->getType(MaskReg); 2364 2365 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2366 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2367 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 2368 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2369 if (DstRB != SrcRB) // Should only happen for hand written MIR. 2370 return false; 2371 2372 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2373 const TargetRegisterClass &RegRC 2374 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2375 2376 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 2377 *MRI); 2378 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 2379 *MRI); 2380 const TargetRegisterClass *MaskRC = 2381 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); 2382 2383 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2384 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2385 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 2386 return false; 2387 2388 MachineBasicBlock *BB = I.getParent(); 2389 const DebugLoc &DL = I.getDebugLoc(); 2390 if (Ty.getSizeInBits() == 32) { 2391 assert(MaskTy.getSizeInBits() == 32 && 2392 "ptrmask should have been narrowed during legalize"); 2393 2394 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 2395 .addReg(SrcReg) 2396 .addReg(MaskReg); 2397 I.eraseFromParent(); 2398 return true; 2399 } 2400 2401 Register HiReg = MRI->createVirtualRegister(&RegRC); 2402 Register LoReg = MRI->createVirtualRegister(&RegRC); 2403 2404 // Extract the subregisters from the source pointer. 2405 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 2406 .addReg(SrcReg, 0, AMDGPU::sub0); 2407 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 2408 .addReg(SrcReg, 0, AMDGPU::sub1); 2409 2410 Register MaskedLo, MaskedHi; 2411 2412 // Try to avoid emitting a bit operation when we only need to touch half of 2413 // the 64-bit pointer. 2414 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); 2415 2416 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 2417 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 2418 if ((MaskOnes & MaskLo32) == MaskLo32) { 2419 // If all the bits in the low half are 1, we only need a copy for it. 2420 MaskedLo = LoReg; 2421 } else { 2422 // Extract the mask subregister and apply the and. 2423 Register MaskLo = MRI->createVirtualRegister(&RegRC); 2424 MaskedLo = MRI->createVirtualRegister(&RegRC); 2425 2426 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 2427 .addReg(MaskReg, 0, AMDGPU::sub0); 2428 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 2429 .addReg(LoReg) 2430 .addReg(MaskLo); 2431 } 2432 2433 if ((MaskOnes & MaskHi32) == MaskHi32) { 2434 // If all the bits in the high half are 1, we only need a copy for it. 
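    // Illustrative example (added): an alignment mask such as
    // 0xFFFFFFFFFFFFFFF0 has every bit of the high half set, so only the low
    // half needed the AND above and the high half is used unchanged here.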
2435 MaskedHi = HiReg; 2436 } else { 2437 Register MaskHi = MRI->createVirtualRegister(&RegRC); 2438 MaskedHi = MRI->createVirtualRegister(&RegRC); 2439 2440 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 2441 .addReg(MaskReg, 0, AMDGPU::sub1); 2442 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 2443 .addReg(HiReg) 2444 .addReg(MaskHi); 2445 } 2446 2447 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2448 .addReg(MaskedLo) 2449 .addImm(AMDGPU::sub0) 2450 .addReg(MaskedHi) 2451 .addImm(AMDGPU::sub1); 2452 I.eraseFromParent(); 2453 return true; 2454 } 2455 2456 /// Return the register to use for the index value, and the subregister to use 2457 /// for the indirectly accessed register. 2458 static std::pair<Register, unsigned> 2459 computeIndirectRegIndex(MachineRegisterInfo &MRI, 2460 const SIRegisterInfo &TRI, 2461 const TargetRegisterClass *SuperRC, 2462 Register IdxReg, 2463 unsigned EltSize) { 2464 Register IdxBaseReg; 2465 int Offset; 2466 MachineInstr *Unused; 2467 2468 std::tie(IdxBaseReg, Offset, Unused) 2469 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 2470 if (IdxBaseReg == AMDGPU::NoRegister) { 2471 // This will happen if the index is a known constant. This should ordinarily 2472 // be legalized out, but handle it as a register just in case. 2473 assert(Offset == 0); 2474 IdxBaseReg = IdxReg; 2475 } 2476 2477 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 2478 2479 // Skip out of bounds offsets, or else we would end up using an undefined 2480 // register. 2481 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 2482 return std::make_pair(IdxReg, SubRegs[0]); 2483 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 2484 } 2485 2486 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 2487 MachineInstr &MI) const { 2488 Register DstReg = MI.getOperand(0).getReg(); 2489 Register SrcReg = MI.getOperand(1).getReg(); 2490 Register IdxReg = MI.getOperand(2).getReg(); 2491 2492 LLT DstTy = MRI->getType(DstReg); 2493 LLT SrcTy = MRI->getType(SrcReg); 2494 2495 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2496 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2497 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2498 2499 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2500 // into a waterfall loop. 2501 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2502 return false; 2503 2504 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 2505 *MRI); 2506 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 2507 *MRI); 2508 if (!SrcRC || !DstRC) 2509 return false; 2510 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2511 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2512 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2513 return false; 2514 2515 MachineBasicBlock *BB = MI.getParent(); 2516 const DebugLoc &DL = MI.getDebugLoc(); 2517 const bool Is64 = DstTy.getSizeInBits() == 64; 2518 2519 unsigned SubReg; 2520 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 2521 DstTy.getSizeInBits() / 8); 2522 2523 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 2524 if (DstTy.getSizeInBits() != 32 && !Is64) 2525 return false; 2526 2527 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2528 .addReg(IdxReg); 2529 2530 unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 2531 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 2532 .addReg(SrcReg, 0, SubReg) 2533 .addReg(SrcReg, RegState::Implicit); 2534 MI.eraseFromParent(); 2535 return true; 2536 } 2537 2538 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 2539 return false; 2540 2541 if (!STI.useVGPRIndexMode()) { 2542 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2543 .addReg(IdxReg); 2544 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 2545 .addReg(SrcReg, 0, SubReg) 2546 .addReg(SrcReg, RegState::Implicit); 2547 MI.eraseFromParent(); 2548 return true; 2549 } 2550 2551 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2552 .addReg(IdxReg) 2553 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 2554 BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg) 2555 .addReg(SrcReg, 0, SubReg) 2556 .addReg(SrcReg, RegState::Implicit) 2557 .addReg(AMDGPU::M0, RegState::Implicit); 2558 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2559 2560 MI.eraseFromParent(); 2561 return true; 2562 } 2563 2564 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 2565 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 2566 MachineInstr &MI) const { 2567 Register DstReg = MI.getOperand(0).getReg(); 2568 Register VecReg = MI.getOperand(1).getReg(); 2569 Register ValReg = MI.getOperand(2).getReg(); 2570 Register IdxReg = MI.getOperand(3).getReg(); 2571 2572 LLT VecTy = MRI->getType(DstReg); 2573 LLT ValTy = MRI->getType(ValReg); 2574 unsigned VecSize = VecTy.getSizeInBits(); 2575 unsigned ValSize = ValTy.getSizeInBits(); 2576 2577 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 2578 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 2579 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2580 2581 assert(VecTy.getElementType() == ValTy); 2582 2583 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2584 // into a waterfall loop. 
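  // (Added note: a waterfall loop is the construct RegBankSelect emits to
  // handle a divergent index, peeling off one unique SGPR value per
  // iteration; by this point the index is expected to be uniform.)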
2585 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2586 return false; 2587 2588 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 2589 *MRI); 2590 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 2591 *MRI); 2592 2593 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 2594 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 2595 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 2596 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2597 return false; 2598 2599 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 2600 return false; 2601 2602 unsigned SubReg; 2603 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 2604 ValSize / 8); 2605 2606 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 2607 STI.useVGPRIndexMode(); 2608 2609 MachineBasicBlock *BB = MI.getParent(); 2610 const DebugLoc &DL = MI.getDebugLoc(); 2611 2612 if (IndexMode) { 2613 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2614 .addReg(IdxReg) 2615 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 2616 } else { 2617 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2618 .addReg(IdxReg); 2619 } 2620 2621 const MCInstrDesc &RegWriteOp 2622 = TII.getIndirectRegWritePseudo(VecSize, ValSize, 2623 VecRB->getID() == AMDGPU::SGPRRegBankID); 2624 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 2625 .addReg(VecReg) 2626 .addReg(ValReg) 2627 .addImm(SubReg); 2628 2629 if (IndexMode) 2630 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2631 2632 MI.eraseFromParent(); 2633 return true; 2634 } 2635 2636 static bool isZeroOrUndef(int X) { 2637 return X == 0 || X == -1; 2638 } 2639 2640 static bool isOneOrUndef(int X) { 2641 return X == 1 || X == -1; 2642 } 2643 2644 static bool isZeroOrOneOrUndef(int X) { 2645 return X == 0 || X == 1 || X == -1; 2646 } 2647 2648 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single 2649 // 32-bit register. 2650 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, 2651 ArrayRef<int> Mask) { 2652 NewMask[0] = Mask[0]; 2653 NewMask[1] = Mask[1]; 2654 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) 2655 return Src0; 2656 2657 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); 2658 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); 2659 2660 // Shift the mask inputs to be 0/1; 2661 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; 2662 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; 2663 return Src1; 2664 } 2665 2666 // This is only legal with VOP3P instructions as an aid to op_sel matching. 
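// For the v2s16 shuffles handled here, the two mask entries select 16-bit
// halves: 0 and 1 refer to the halves of the first source, 2 and 3 to the
// halves of the second, and -1 means undef (see normalizeVOP3PMask above).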
2667 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( 2668 MachineInstr &MI) const { 2669 Register DstReg = MI.getOperand(0).getReg(); 2670 Register Src0Reg = MI.getOperand(1).getReg(); 2671 Register Src1Reg = MI.getOperand(2).getReg(); 2672 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); 2673 2674 const LLT V2S16 = LLT::vector(2, 16); 2675 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) 2676 return false; 2677 2678 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) 2679 return false; 2680 2681 assert(ShufMask.size() == 2); 2682 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); 2683 2684 MachineBasicBlock *MBB = MI.getParent(); 2685 const DebugLoc &DL = MI.getDebugLoc(); 2686 2687 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2688 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2689 const TargetRegisterClass &RC = IsVALU ? 2690 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2691 2692 // Handle the degenerate case which should have folded out. 2693 if (ShufMask[0] == -1 && ShufMask[1] == -1) { 2694 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); 2695 2696 MI.eraseFromParent(); 2697 return RBI.constrainGenericRegister(DstReg, RC, *MRI); 2698 } 2699 2700 // A legal VOP3P mask only reads one of the sources. 2701 int Mask[2]; 2702 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); 2703 2704 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || 2705 !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) 2706 return false; 2707 2708 // TODO: This also should have been folded out 2709 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { 2710 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) 2711 .addReg(SrcVec); 2712 2713 MI.eraseFromParent(); 2714 return true; 2715 } 2716 2717 if (Mask[0] == 1 && Mask[1] == -1) { 2718 if (IsVALU) { 2719 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 2720 .addImm(16) 2721 .addReg(SrcVec); 2722 } else { 2723 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 2724 .addReg(SrcVec) 2725 .addImm(16); 2726 } 2727 } else if (Mask[0] == -1 && Mask[1] == 0) { 2728 if (IsVALU) { 2729 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) 2730 .addImm(16) 2731 .addReg(SrcVec); 2732 } else { 2733 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) 2734 .addReg(SrcVec) 2735 .addImm(16); 2736 } 2737 } else if (Mask[0] == 0 && Mask[1] == 0) { 2738 if (IsVALU) { 2739 // Write low half of the register into the high half. 2740 MachineInstr *MovSDWA = 2741 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2742 .addImm(0) // $src0_modifiers 2743 .addReg(SrcVec) // $src0 2744 .addImm(0) // $clamp 2745 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2746 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2747 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2748 .addReg(SrcVec, RegState::Implicit); 2749 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2750 } else { 2751 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2752 .addReg(SrcVec) 2753 .addReg(SrcVec); 2754 } 2755 } else if (Mask[0] == 1 && Mask[1] == 1) { 2756 if (IsVALU) { 2757 // Write high half of the register into the low half. 
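      // Illustrative example (added): with SrcVec = 0xAAAABBBB this yields
      // 0xAAAAAAAA, i.e. both result halves come from the original high half.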
2758 MachineInstr *MovSDWA = 2759 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2760 .addImm(0) // $src0_modifiers 2761 .addReg(SrcVec) // $src0 2762 .addImm(0) // $clamp 2763 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel 2764 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2765 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel 2766 .addReg(SrcVec, RegState::Implicit); 2767 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2768 } else { 2769 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) 2770 .addReg(SrcVec) 2771 .addReg(SrcVec); 2772 } 2773 } else if (Mask[0] == 1 && Mask[1] == 0) { 2774 if (IsVALU) { 2775 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg) 2776 .addReg(SrcVec) 2777 .addReg(SrcVec) 2778 .addImm(16); 2779 } else { 2780 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2781 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) 2782 .addReg(SrcVec) 2783 .addImm(16); 2784 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2785 .addReg(TmpReg) 2786 .addReg(SrcVec); 2787 } 2788 } else 2789 llvm_unreachable("all shuffle masks should be handled"); 2790 2791 MI.eraseFromParent(); 2792 return true; 2793 } 2794 2795 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 2796 if (I.isPHI()) 2797 return selectPHI(I); 2798 2799 if (!I.isPreISelOpcode()) { 2800 if (I.isCopy()) 2801 return selectCOPY(I); 2802 return true; 2803 } 2804 2805 switch (I.getOpcode()) { 2806 case TargetOpcode::G_AND: 2807 case TargetOpcode::G_OR: 2808 case TargetOpcode::G_XOR: 2809 if (selectImpl(I, *CoverageInfo)) 2810 return true; 2811 return selectG_AND_OR_XOR(I); 2812 case TargetOpcode::G_ADD: 2813 case TargetOpcode::G_SUB: 2814 if (selectImpl(I, *CoverageInfo)) 2815 return true; 2816 return selectG_ADD_SUB(I); 2817 case TargetOpcode::G_UADDO: 2818 case TargetOpcode::G_USUBO: 2819 case TargetOpcode::G_UADDE: 2820 case TargetOpcode::G_USUBE: 2821 return selectG_UADDO_USUBO_UADDE_USUBE(I); 2822 case TargetOpcode::G_INTTOPTR: 2823 case TargetOpcode::G_BITCAST: 2824 case TargetOpcode::G_PTRTOINT: 2825 return selectCOPY(I); 2826 case TargetOpcode::G_CONSTANT: 2827 case TargetOpcode::G_FCONSTANT: 2828 return selectG_CONSTANT(I); 2829 case TargetOpcode::G_FNEG: 2830 if (selectImpl(I, *CoverageInfo)) 2831 return true; 2832 return selectG_FNEG(I); 2833 case TargetOpcode::G_FABS: 2834 if (selectImpl(I, *CoverageInfo)) 2835 return true; 2836 return selectG_FABS(I); 2837 case TargetOpcode::G_EXTRACT: 2838 return selectG_EXTRACT(I); 2839 case TargetOpcode::G_MERGE_VALUES: 2840 case TargetOpcode::G_BUILD_VECTOR: 2841 case TargetOpcode::G_CONCAT_VECTORS: 2842 return selectG_MERGE_VALUES(I); 2843 case TargetOpcode::G_UNMERGE_VALUES: 2844 return selectG_UNMERGE_VALUES(I); 2845 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2846 return selectG_BUILD_VECTOR_TRUNC(I); 2847 case TargetOpcode::G_PTR_ADD: 2848 return selectG_PTR_ADD(I); 2849 case TargetOpcode::G_IMPLICIT_DEF: 2850 return selectG_IMPLICIT_DEF(I); 2851 case TargetOpcode::G_FREEZE: 2852 return selectCOPY(I); 2853 case TargetOpcode::G_INSERT: 2854 return selectG_INSERT(I); 2855 case TargetOpcode::G_INTRINSIC: 2856 return selectG_INTRINSIC(I); 2857 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 2858 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 2859 case TargetOpcode::G_ICMP: 2860 if (selectG_ICMP(I)) 2861 return true; 2862 return selectImpl(I, *CoverageInfo); 2863 case TargetOpcode::G_LOAD: 2864 case TargetOpcode::G_ATOMIC_CMPXCHG: 2865 case TargetOpcode::G_ATOMICRMW_XCHG: 2866 case 
TargetOpcode::G_ATOMICRMW_ADD: 2867 case TargetOpcode::G_ATOMICRMW_SUB: 2868 case TargetOpcode::G_ATOMICRMW_AND: 2869 case TargetOpcode::G_ATOMICRMW_OR: 2870 case TargetOpcode::G_ATOMICRMW_XOR: 2871 case TargetOpcode::G_ATOMICRMW_MIN: 2872 case TargetOpcode::G_ATOMICRMW_MAX: 2873 case TargetOpcode::G_ATOMICRMW_UMIN: 2874 case TargetOpcode::G_ATOMICRMW_UMAX: 2875 case TargetOpcode::G_ATOMICRMW_FADD: 2876 case AMDGPU::G_AMDGPU_ATOMIC_INC: 2877 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 2878 return selectG_LOAD_ATOMICRMW(I); 2879 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 2880 return selectG_AMDGPU_ATOMIC_CMPXCHG(I); 2881 case TargetOpcode::G_SELECT: 2882 return selectG_SELECT(I); 2883 case TargetOpcode::G_STORE: 2884 return selectG_STORE(I); 2885 case TargetOpcode::G_TRUNC: 2886 return selectG_TRUNC(I); 2887 case TargetOpcode::G_SEXT: 2888 case TargetOpcode::G_ZEXT: 2889 case TargetOpcode::G_ANYEXT: 2890 case TargetOpcode::G_SEXT_INREG: 2891 if (selectImpl(I, *CoverageInfo)) 2892 return true; 2893 return selectG_SZA_EXT(I); 2894 case TargetOpcode::G_BRCOND: 2895 return selectG_BRCOND(I); 2896 case TargetOpcode::G_FRAME_INDEX: 2897 case TargetOpcode::G_GLOBAL_VALUE: 2898 return selectG_FRAME_INDEX_GLOBAL_VALUE(I); 2899 case TargetOpcode::G_PTRMASK: 2900 return selectG_PTRMASK(I); 2901 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2902 return selectG_EXTRACT_VECTOR_ELT(I); 2903 case TargetOpcode::G_INSERT_VECTOR_ELT: 2904 return selectG_INSERT_VECTOR_ELT(I); 2905 case TargetOpcode::G_SHUFFLE_VECTOR: 2906 return selectG_SHUFFLE_VECTOR(I); 2907 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 2908 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 2909 const AMDGPU::ImageDimIntrinsicInfo *Intr 2910 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); 2911 assert(Intr && "not an image intrinsic with image pseudo"); 2912 return selectImageIntrinsic(I, Intr); 2913 } 2914 default: 2915 return selectImpl(I, *CoverageInfo); 2916 } 2917 return false; 2918 } 2919 2920 InstructionSelector::ComplexRendererFns 2921 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 2922 return {{ 2923 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2924 }}; 2925 2926 } 2927 2928 std::pair<Register, unsigned> 2929 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const { 2930 Register Src = Root.getReg(); 2931 Register OrigSrc = Src; 2932 unsigned Mods = 0; 2933 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 2934 2935 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 2936 Src = MI->getOperand(1).getReg(); 2937 Mods |= SISrcMods::NEG; 2938 MI = getDefIgnoringCopies(Src, *MRI); 2939 } 2940 2941 if (MI && MI->getOpcode() == AMDGPU::G_FABS) { 2942 Src = MI->getOperand(1).getReg(); 2943 Mods |= SISrcMods::ABS; 2944 } 2945 2946 if (Mods != 0 && 2947 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 2948 MachineInstr *UseMI = Root.getParent(); 2949 2950 // If we looked through copies to find source modifiers on an SGPR operand, 2951 // we now have an SGPR register source. To avoid potentially violating the 2952 // constant bus restriction, we need to insert a copy to a VGPR. 2953 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); 2954 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 2955 TII.get(AMDGPU::COPY), VGPRSrc) 2956 .addReg(Src); 2957 Src = VGPRSrc; 2958 } 2959 2960 return std::make_pair(Src, Mods); 2961 } 2962 2963 /// 2964 /// This will select either an SGPR or VGPR operand and will save us from 2965 /// having to write an extra tablegen pattern. 
2966 InstructionSelector::ComplexRendererFns 2967 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 2968 return {{ 2969 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2970 }}; 2971 } 2972 2973 InstructionSelector::ComplexRendererFns 2974 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 2975 Register Src; 2976 unsigned Mods; 2977 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2978 2979 return {{ 2980 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2981 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 2982 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2983 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2984 }}; 2985 } 2986 2987 InstructionSelector::ComplexRendererFns 2988 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 2989 return {{ 2990 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2991 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2992 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2993 }}; 2994 } 2995 2996 InstructionSelector::ComplexRendererFns 2997 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 2998 Register Src; 2999 unsigned Mods; 3000 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3001 3002 return {{ 3003 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3004 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3005 }}; 3006 } 3007 3008 InstructionSelector::ComplexRendererFns 3009 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 3010 Register Reg = Root.getReg(); 3011 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 3012 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 3013 Def->getOpcode() == AMDGPU::G_FABS)) 3014 return {}; 3015 return {{ 3016 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3017 }}; 3018 } 3019 3020 std::pair<Register, unsigned> 3021 AMDGPUInstructionSelector::selectVOP3PModsImpl( 3022 Register Src, const MachineRegisterInfo &MRI) const { 3023 unsigned Mods = 0; 3024 MachineInstr *MI = MRI.getVRegDef(Src); 3025 3026 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 3027 // It's possible to see an f32 fneg here, but unlikely. 3028 // TODO: Treat f32 fneg as only high bit. 3029 MRI.getType(Src) == LLT::vector(2, 16)) { 3030 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 3031 Src = MI->getOperand(1).getReg(); 3032 MI = MRI.getVRegDef(Src); 3033 } 3034 3035 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 3036 3037 // Packed instructions do not have abs modifiers. 
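  // (Added note, best-effort interpretation: OP_SEL_1 is the op_sel_hi bit,
  // whose set state is the neutral default where the high half of the source
  // feeds the high half of the result.)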
3038 Mods |= SISrcMods::OP_SEL_1; 3039 3040 return std::make_pair(Src, Mods); 3041 } 3042 3043 InstructionSelector::ComplexRendererFns 3044 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 3045 MachineRegisterInfo &MRI 3046 = Root.getParent()->getParent()->getParent()->getRegInfo(); 3047 3048 Register Src; 3049 unsigned Mods; 3050 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 3051 3052 return {{ 3053 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3054 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3055 }}; 3056 } 3057 3058 InstructionSelector::ComplexRendererFns 3059 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 3060 Register Src; 3061 unsigned Mods; 3062 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3063 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) 3064 return None; 3065 3066 return {{ 3067 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3068 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3069 }}; 3070 } 3071 3072 InstructionSelector::ComplexRendererFns 3073 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 3074 // FIXME: Handle op_sel 3075 return {{ 3076 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3077 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 3078 }}; 3079 } 3080 3081 InstructionSelector::ComplexRendererFns 3082 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 3083 SmallVector<GEPInfo, 4> AddrInfo; 3084 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3085 3086 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3087 return None; 3088 3089 const GEPInfo &GEPInfo = AddrInfo[0]; 3090 Optional<int64_t> EncodedImm = 3091 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false); 3092 if (!EncodedImm) 3093 return None; 3094 3095 unsigned PtrReg = GEPInfo.SgprParts[0]; 3096 return {{ 3097 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3098 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3099 }}; 3100 } 3101 3102 InstructionSelector::ComplexRendererFns 3103 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 3104 SmallVector<GEPInfo, 4> AddrInfo; 3105 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3106 3107 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3108 return None; 3109 3110 const GEPInfo &GEPInfo = AddrInfo[0]; 3111 Register PtrReg = GEPInfo.SgprParts[0]; 3112 Optional<int64_t> EncodedImm = 3113 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 3114 if (!EncodedImm) 3115 return None; 3116 3117 return {{ 3118 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3119 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3120 }}; 3121 } 3122 3123 InstructionSelector::ComplexRendererFns 3124 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 3125 MachineInstr *MI = Root.getParent(); 3126 MachineBasicBlock *MBB = MI->getParent(); 3127 3128 SmallVector<GEPInfo, 4> AddrInfo; 3129 getAddrModeInfo(*MI, *MRI, AddrInfo); 3130 3131 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, 3132 // then we can select all ptr + 32-bit offsets not just immediate offsets. 3133 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3134 return None; 3135 3136 const GEPInfo &GEPInfo = AddrInfo[0]; 3137 // SGPR offset is unsigned. 
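  // (Added note: GEPInfo.Imm below is the byte offset collected by
  // getAddrModeInfo; a zero or negative value, or one wider than 32 bits,
  // cannot use the _SGPR form selected here.)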
  if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM patterns are considered before the _SGPR patterns.
  Register PtrReg = GEPInfo.SgprParts[0];
  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}

template <bool Signed>
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();

  InstructionSelector::ComplexRendererFns Default = {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
    }};

  if (!STI.hasFlatInstOffsets())
    return Default;

  const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
  if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
    return Default;

  Optional<int64_t> Offset =
    getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
  if (!Offset.hasValue())
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
    return Default;

  Register BasePtr = OpDef->getOperand(1).getReg();

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  return selectFlatOffsetImpl<false>(Root);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
  return selectFlatOffsetImpl<true>(Root);
}

static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
      Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
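    // The constant is split here: the 4K-aligned high bits go into a VGPR
    // used as vaddr, and the low 12 bits are folded into the immediate
    // offset operand rendered below.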
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
      .addImm(Offset & ~4095);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               const MachineMemOperand *MMO = *MI->memoperands_begin();
               const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

               if (isStackPtrRelative(PtrInfo))
                 MIB.addReg(Info->getStackPtrOffsetReg());
               else
                 MIB.addImm(0);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0 || Offset == -1);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    if (isBaseWithConstantOffset(Root, *MRI)) {
      const MachineOperand &LHS = RootDef->getOperand(1);
      const MachineOperand &RHS = RootDef->getOperand(2);
      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
      if (LHSDef && RHSDef) {
        int64_t PossibleOffset =
          RHSDef->getOperand(1).getCImm()->getSExtValue();
        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
            (!STI.privateMemoryResourceIsRangeChecked() ||
             KnownBits->signBitIsZero(LHS.getReg()))) {
          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
            FI = LHSDef->getOperand(1).getIndex();
          else
            VAddr = LHS.getReg();
          Offset = PossibleOffset;
        }
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             // If we don't know this private access is a local stack object,
             // it needs to be relative to the entry point's scratch wave
             // offset.
             // TODO: Should split large offsets that don't fit like above.
             // TODO: Don't use scratch wave offset just because the offset
             // didn't fit.
             if (!Info->isEntryFunction() && FI.hasValue())
               MIB.addReg(Info->getStackPtrOffsetReg());
             else
               MIB.addImm(0);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}

bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
                                                int64_t Offset,
                                                unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
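  // i.e. only fold the offset when the base is known to be non-negative.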
  return KnownBits->signBitIsZero(Base);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(Info->getScratchRSrcReg());
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (isStackPtrRelative(PtrInfo))
          MIB.addReg(Info->getStackPtrOffsetReg());
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
    }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    if (isDSOffsetLegal(PtrBase, Offset, 16)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, Offset);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset + 1); }
    }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    int64_t DWordOffset0 = Offset / 4;
    int64_t DWordOffset1 = DWordOffset0 + 1;
    if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, DWordOffset0);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this
/// does not match the pattern.
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
  Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = MRI.getVRegDef(Root);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0};

  MachineOperand &RHS = RootI->getOperand(2);
  Optional<ValueAndVReg> MaybeOffset
    = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
  if (!MaybeOffset)
    return {Root, 0};
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
}

static void addZeroImm(MachineInstrBuilder &MIB) {
  MIB.addImm(0);
}

/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If
/// \p BasePtr is not valid, a null base pointer will be used.
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}

static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}

static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
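  // Unlike buildAddr64RSrc above, the low format word passed here is all
  // ones; buildRSRC places it in the third dword (sub2) of the 128-bit
  // descriptor.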
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: Don't know that this was defined by operand 0
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return true if the addr64 MUBUF mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(SOffset)
    .addImm(ImmOffset);
  ImmOffset = 0;
}

bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
  Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // The addr64 bit was removed for Volcanic Islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
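      // Use the uniform N2 as the SRD base pointer and put N3 in the 64-bit
      // vaddr operand.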
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource.
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
  int64_t &Offset) const {
  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Register SRDPtr = AddrData.N0;
  Offset = AddrData.Offset;

  // TODO: Look through extensions for 32-bit soffset.
  MachineIRBuilder B(*Root.getParent());

  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
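  // The atomic variants only render the slc cache-policy operand, so fewer
  // trailing zero immediates are emitted than for the non-atomic forms above.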
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm // slc
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm // slc
    }};
}

/// Get an immediate that must be 32 bits, and is treated as zero extended.
static Optional<uint64_t> getConstantZext32Val(Register Reg,
                                               const MachineRegisterInfo &MRI) {
  // getConstantVRegVal sexts any values, so see if that matters.
  Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return None;
  return Lo_32(*OffsetVal);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm =
    AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm
    = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx == -1);

  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
G_CONSTANT"); 3829 MIB.addImm(Op.getCImm()->getSExtValue()); 3830 } 3831 } 3832 3833 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, 3834 const MachineInstr &MI, 3835 int OpIdx) const { 3836 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 3837 "Expected G_CONSTANT"); 3838 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); 3839 } 3840 3841 /// This only really exists to satisfy DAG type checking machinery, so is a 3842 /// no-op here. 3843 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, 3844 const MachineInstr &MI, 3845 int OpIdx) const { 3846 MIB.addImm(MI.getOperand(OpIdx).getImm()); 3847 } 3848 3849 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB, 3850 const MachineInstr &MI, 3851 int OpIdx) const { 3852 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3853 MIB.addImm(MI.getOperand(OpIdx).getImm() & 1); 3854 } 3855 3856 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB, 3857 const MachineInstr &MI, 3858 int OpIdx) const { 3859 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3860 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1); 3861 } 3862 3863 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB, 3864 const MachineInstr &MI, 3865 int OpIdx) const { 3866 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3867 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1); 3868 } 3869 3870 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, 3871 const MachineInstr &MI, 3872 int OpIdx) const { 3873 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3874 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); 3875 } 3876 3877 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { 3878 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); 3879 } 3880 3881 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const { 3882 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm()); 3883 } 3884 3885 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const { 3886 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm()); 3887 } 3888 3889 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { 3890 return TII.isInlineConstant(Imm); 3891 } 3892