1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUInstructionSelector.h" 15 #include "AMDGPUInstrInfo.h" 16 #include "AMDGPUGlobalISelUtils.h" 17 #include "AMDGPURegisterBankInfo.h" 18 #include "AMDGPUSubtarget.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 21 #include "SIMachineFunctionInfo.h" 22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" 25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 26 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 27 #include "llvm/CodeGen/GlobalISel/Utils.h" 28 #include "llvm/CodeGen/MachineBasicBlock.h" 29 #include "llvm/CodeGen/MachineFunction.h" 30 #include "llvm/CodeGen/MachineInstr.h" 31 #include "llvm/CodeGen/MachineInstrBuilder.h" 32 #include "llvm/CodeGen/MachineRegisterInfo.h" 33 #include "llvm/IR/Type.h" 34 #include "llvm/Support/Debug.h" 35 #include "llvm/Support/raw_ostream.h" 36 37 #define DEBUG_TYPE "amdgpu-isel" 38 39 using namespace llvm; 40 using namespace MIPatternMatch; 41 42 static cl::opt<bool> AllowRiskySelect( 43 "amdgpu-global-isel-risky-select", 44 cl::desc("Allow GlobalISel to select cases that are likely to not work yet"), 45 cl::init(false), 46 cl::ReallyHidden); 47 48 #define GET_GLOBALISEL_IMPL 49 #define AMDGPUSubtarget GCNSubtarget 50 #include "AMDGPUGenGlobalISel.inc" 51 #undef GET_GLOBALISEL_IMPL 52 #undef AMDGPUSubtarget 53 54 AMDGPUInstructionSelector::AMDGPUInstructionSelector( 55 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, 56 const AMDGPUTargetMachine &TM) 57 : InstructionSelector(), TII(*STI.getInstrInfo()), 58 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), 59 STI(STI), 60 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), 61 #define GET_GLOBALISEL_PREDICATES_INIT 62 #include "AMDGPUGenGlobalISel.inc" 63 #undef GET_GLOBALISEL_PREDICATES_INIT 64 #define GET_GLOBALISEL_TEMPORARIES_INIT 65 #include "AMDGPUGenGlobalISel.inc" 66 #undef GET_GLOBALISEL_TEMPORARIES_INIT 67 { 68 } 69 70 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } 71 72 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, 73 CodeGenCoverage &CoverageInfo) { 74 MRI = &MF.getRegInfo(); 75 InstructionSelector::setupMF(MF, KB, CoverageInfo); 76 } 77 78 bool AMDGPUInstructionSelector::isVCC(Register Reg, 79 const MachineRegisterInfo &MRI) const { 80 if (Register::isPhysicalRegister(Reg)) 81 return Reg == TRI.getVCC(); 82 83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 84 const TargetRegisterClass *RC = 85 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 86 if (RC) { 87 const LLT Ty = MRI.getType(Reg); 88 return RC->hasSuperClassEq(TRI.getBoolRC()) && 89 Ty.isValid() && Ty.getSizeInBits() == 1; 90 } 91 92 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 93 return RB->getID() == 
AMDGPU::VCCRegBankID; 94 } 95 96 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, 97 unsigned NewOpc) const { 98 MI.setDesc(TII.get(NewOpc)); 99 MI.RemoveOperand(1); // Remove intrinsic ID. 100 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 101 102 MachineOperand &Dst = MI.getOperand(0); 103 MachineOperand &Src = MI.getOperand(1); 104 105 // TODO: This should be legalized to s32 if needed 106 if (MRI->getType(Dst.getReg()) == LLT::scalar(1)) 107 return false; 108 109 const TargetRegisterClass *DstRC 110 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 111 const TargetRegisterClass *SrcRC 112 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 113 if (!DstRC || DstRC != SrcRC) 114 return false; 115 116 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && 117 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); 118 } 119 120 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { 121 const DebugLoc &DL = I.getDebugLoc(); 122 MachineBasicBlock *BB = I.getParent(); 123 I.setDesc(TII.get(TargetOpcode::COPY)); 124 125 const MachineOperand &Src = I.getOperand(1); 126 MachineOperand &Dst = I.getOperand(0); 127 Register DstReg = Dst.getReg(); 128 Register SrcReg = Src.getReg(); 129 130 if (isVCC(DstReg, *MRI)) { 131 if (SrcReg == AMDGPU::SCC) { 132 const TargetRegisterClass *RC 133 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 134 if (!RC) 135 return true; 136 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 137 } 138 139 if (!isVCC(SrcReg, *MRI)) { 140 // TODO: Should probably leave the copy and let copyPhysReg expand it. 141 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI)) 142 return false; 143 144 const TargetRegisterClass *SrcRC 145 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 146 147 Register MaskedReg = MRI->createVirtualRegister(SrcRC); 148 149 // We can't trust the high bits at this point, so clear them. 150 151 // TODO: Skip masking high bits if def is known boolean. 152 153 unsigned AndOpc = TRI.isSGPRClass(SrcRC) ? 154 AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32; 155 BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg) 156 .addImm(1) 157 .addReg(SrcReg); 158 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) 159 .addImm(0) 160 .addReg(MaskedReg); 161 162 if (!MRI->getRegClassOrNull(SrcReg)) 163 MRI->setRegClass(SrcReg, SrcRC); 164 I.eraseFromParent(); 165 return true; 166 } 167 168 const TargetRegisterClass *RC = 169 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 170 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) 171 return false; 172 173 // Don't constrain the source register to a class so the def instruction 174 // handles it (unless it's undef). 175 // 176 // FIXME: This is a hack. When selecting the def, we neeed to know 177 // specifically know that the result is VCCRegBank, and not just an SGPR 178 // with size 1. An SReg_32 with size 1 is ambiguous with wave32. 
179 if (Src.isUndef()) { 180 const TargetRegisterClass *SrcRC = 181 TRI.getConstrainedRegClassForOperand(Src, *MRI); 182 if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 183 return false; 184 } 185 186 return true; 187 } 188 189 for (const MachineOperand &MO : I.operands()) { 190 if (Register::isPhysicalRegister(MO.getReg())) 191 continue; 192 193 const TargetRegisterClass *RC = 194 TRI.getConstrainedRegClassForOperand(MO, *MRI); 195 if (!RC) 196 continue; 197 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 198 } 199 return true; 200 } 201 202 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { 203 const Register DefReg = I.getOperand(0).getReg(); 204 const LLT DefTy = MRI->getType(DefReg); 205 if (DefTy == LLT::scalar(1)) { 206 if (!AllowRiskySelect) { 207 LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n"); 208 return false; 209 } 210 211 LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n"); 212 } 213 214 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy) 215 216 const RegClassOrRegBank &RegClassOrBank = 217 MRI->getRegClassOrRegBank(DefReg); 218 219 const TargetRegisterClass *DefRC 220 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 221 if (!DefRC) { 222 if (!DefTy.isValid()) { 223 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 224 return false; 225 } 226 227 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 228 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); 229 if (!DefRC) { 230 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 231 return false; 232 } 233 } 234 235 // TODO: Verify that all registers have the same bank 236 I.setDesc(TII.get(TargetOpcode::PHI)); 237 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); 238 } 239 240 MachineOperand 241 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, 242 const TargetRegisterClass &SubRC, 243 unsigned SubIdx) const { 244 245 MachineInstr *MI = MO.getParent(); 246 MachineBasicBlock *BB = MO.getParent()->getParent(); 247 Register DstReg = MRI->createVirtualRegister(&SubRC); 248 249 if (MO.isReg()) { 250 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); 251 Register Reg = MO.getReg(); 252 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 253 .addReg(Reg, 0, ComposedSubIdx); 254 255 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), 256 MO.isKill(), MO.isDead(), MO.isUndef(), 257 MO.isEarlyClobber(), 0, MO.isDebug(), 258 MO.isInternalRead()); 259 } 260 261 assert(MO.isImm()); 262 263 APInt Imm(64, MO.getImm()); 264 265 switch (SubIdx) { 266 default: 267 llvm_unreachable("do not know to split immediate with this sub index."); 268 case AMDGPU::sub0: 269 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); 270 case AMDGPU::sub1: 271 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); 272 } 273 } 274 275 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { 276 switch (Opc) { 277 case AMDGPU::G_AND: 278 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 279 case AMDGPU::G_OR: 280 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; 281 case AMDGPU::G_XOR: 282 return Is64 ? 
AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 283 default: 284 llvm_unreachable("not a bit op"); 285 } 286 } 287 288 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 289 MachineOperand &Dst = I.getOperand(0); 290 MachineOperand &Src0 = I.getOperand(1); 291 MachineOperand &Src1 = I.getOperand(2); 292 Register DstReg = Dst.getReg(); 293 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 294 295 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 296 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 297 const TargetRegisterClass *RC = TRI.getBoolRC(); 298 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), 299 RC == &AMDGPU::SReg_64RegClass); 300 I.setDesc(TII.get(InstOpc)); 301 // Dead implicit-def of scc 302 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 303 true, // isImp 304 false, // isKill 305 true)); // isDead 306 307 // FIXME: Hack to avoid turning the register bank into a register class. 308 // The selector for G_ICMP relies on seeing the register bank for the result 309 // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will 310 // be ambiguous whether it's a scalar or vector bool. 311 if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) 312 MRI->setRegClass(Src0.getReg(), RC); 313 if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) 314 MRI->setRegClass(Src1.getReg(), RC); 315 316 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 317 } 318 319 // TODO: Should this allow an SCC bank result, and produce a copy from SCC for 320 // the result? 321 if (DstRB->getID() == AMDGPU::SGPRRegBankID) { 322 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); 323 I.setDesc(TII.get(InstOpc)); 324 // Dead implicit-def of scc 325 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 326 true, // isImp 327 false, // isKill 328 true)); // isDead 329 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 330 } 331 332 return false; 333 } 334 335 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 336 MachineBasicBlock *BB = I.getParent(); 337 MachineFunction *MF = BB->getParent(); 338 Register DstReg = I.getOperand(0).getReg(); 339 const DebugLoc &DL = I.getDebugLoc(); 340 LLT Ty = MRI->getType(DstReg); 341 if (Ty.isVector()) 342 return false; 343 344 unsigned Size = Ty.getSizeInBits(); 345 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 346 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 347 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 348 349 if (Size == 32) { 350 if (IsSALU) { 351 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 352 MachineInstr *Add = 353 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 354 .add(I.getOperand(1)) 355 .add(I.getOperand(2)); 356 I.eraseFromParent(); 357 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 358 } 359 360 if (STI.hasAddNoCarry()) { 361 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 362 I.setDesc(TII.get(Opc)); 363 I.addOperand(*MF, MachineOperand::CreateImm(0)); 364 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 365 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 366 } 367 368 const unsigned Opc = Sub ? 
AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64; 369 370 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); 371 MachineInstr *Add 372 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 373 .addDef(UnusedCarry, RegState::Dead) 374 .add(I.getOperand(1)) 375 .add(I.getOperand(2)) 376 .addImm(0); 377 I.eraseFromParent(); 378 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 379 } 380 381 assert(!Sub && "illegal sub should not reach here"); 382 383 const TargetRegisterClass &RC 384 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; 385 const TargetRegisterClass &HalfRC 386 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass; 387 388 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0)); 389 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0)); 390 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); 391 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); 392 393 Register DstLo = MRI->createVirtualRegister(&HalfRC); 394 Register DstHi = MRI->createVirtualRegister(&HalfRC); 395 396 if (IsSALU) { 397 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) 398 .add(Lo1) 399 .add(Lo2); 400 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) 401 .add(Hi1) 402 .add(Hi2); 403 } else { 404 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); 405 Register CarryReg = MRI->createVirtualRegister(CarryRC); 406 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo) 407 .addDef(CarryReg) 408 .add(Lo1) 409 .add(Lo2) 410 .addImm(0); 411 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) 412 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) 413 .add(Hi1) 414 .add(Hi2) 415 .addReg(CarryReg, RegState::Kill) 416 .addImm(0); 417 418 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) 419 return false; 420 } 421 422 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 423 .addReg(DstLo) 424 .addImm(AMDGPU::sub0) 425 .addReg(DstHi) 426 .addImm(AMDGPU::sub1); 427 428 429 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 430 return false; 431 432 I.eraseFromParent(); 433 return true; 434 } 435 436 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( 437 MachineInstr &I) const { 438 MachineBasicBlock *BB = I.getParent(); 439 MachineFunction *MF = BB->getParent(); 440 const DebugLoc &DL = I.getDebugLoc(); 441 Register Dst0Reg = I.getOperand(0).getReg(); 442 Register Dst1Reg = I.getOperand(1).getReg(); 443 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO || 444 I.getOpcode() == AMDGPU::G_UADDE; 445 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE || 446 I.getOpcode() == AMDGPU::G_USUBE; 447 448 if (isVCC(Dst1Reg, *MRI)) { 449 unsigned NoCarryOpc = 450 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; 451 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 452 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc)); 453 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 454 I.addOperand(*MF, MachineOperand::CreateImm(0)); 455 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 456 } 457 458 Register Src0Reg = I.getOperand(2).getReg(); 459 Register Src1Reg = I.getOperand(3).getReg(); 460 461 if (HasCarryIn) { 462 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 463 .addReg(I.getOperand(4).getReg()); 464 } 465 466 unsigned NoCarryOpc = IsAdd ? 
AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 467 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 468 469 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg) 470 .add(I.getOperand(2)) 471 .add(I.getOperand(3)); 472 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 473 .addReg(AMDGPU::SCC); 474 475 if (!MRI->getRegClassOrNull(Dst1Reg)) 476 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 477 478 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 479 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 480 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 481 return false; 482 483 if (HasCarryIn && 484 !RBI.constrainGenericRegister(I.getOperand(4).getReg(), 485 AMDGPU::SReg_32RegClass, *MRI)) 486 return false; 487 488 I.eraseFromParent(); 489 return true; 490 } 491 492 // TODO: We should probably legalize these to only using 32-bit results. 493 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 494 MachineBasicBlock *BB = I.getParent(); 495 Register DstReg = I.getOperand(0).getReg(); 496 Register SrcReg = I.getOperand(1).getReg(); 497 LLT DstTy = MRI->getType(DstReg); 498 LLT SrcTy = MRI->getType(SrcReg); 499 const unsigned SrcSize = SrcTy.getSizeInBits(); 500 unsigned DstSize = DstTy.getSizeInBits(); 501 502 // TODO: Should handle any multiple of 32 offset. 503 unsigned Offset = I.getOperand(2).getImm(); 504 if (Offset % 32 != 0 || DstSize > 128) 505 return false; 506 507 // 16-bit operations really use 32-bit registers. 508 // FIXME: Probably should not allow 16-bit G_EXTRACT results. 509 if (DstSize == 16) 510 DstSize = 32; 511 512 const TargetRegisterClass *DstRC = 513 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI); 514 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 515 return false; 516 517 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 518 const TargetRegisterClass *SrcRC = 519 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 520 if (!SrcRC) 521 return false; 522 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32, 523 DstSize / 32); 524 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg); 525 if (!SrcRC) 526 return false; 527 528 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I, 529 *SrcRC, I.getOperand(1)); 530 const DebugLoc &DL = I.getDebugLoc(); 531 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) 532 .addReg(SrcReg, 0, SubReg); 533 534 I.eraseFromParent(); 535 return true; 536 } 537 538 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 539 MachineBasicBlock *BB = MI.getParent(); 540 Register DstReg = MI.getOperand(0).getReg(); 541 LLT DstTy = MRI->getType(DstReg); 542 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 543 544 const unsigned SrcSize = SrcTy.getSizeInBits(); 545 if (SrcSize < 32) 546 return selectImpl(MI, *CoverageInfo); 547 548 const DebugLoc &DL = MI.getDebugLoc(); 549 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 550 const unsigned DstSize = DstTy.getSizeInBits(); 551 const TargetRegisterClass *DstRC = 552 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 553 if (!DstRC) 554 return false; 555 556 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 557 MachineInstrBuilder MIB = 558 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 559 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 560 MachineOperand &Src = MI.getOperand(I + 1); 561 
MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 562 MIB.addImm(SubRegs[I]); 563 564 const TargetRegisterClass *SrcRC 565 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 566 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 567 return false; 568 } 569 570 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 571 return false; 572 573 MI.eraseFromParent(); 574 return true; 575 } 576 577 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 578 MachineBasicBlock *BB = MI.getParent(); 579 const int NumDst = MI.getNumOperands() - 1; 580 581 MachineOperand &Src = MI.getOperand(NumDst); 582 583 Register SrcReg = Src.getReg(); 584 Register DstReg0 = MI.getOperand(0).getReg(); 585 LLT DstTy = MRI->getType(DstReg0); 586 LLT SrcTy = MRI->getType(SrcReg); 587 588 const unsigned DstSize = DstTy.getSizeInBits(); 589 const unsigned SrcSize = SrcTy.getSizeInBits(); 590 const DebugLoc &DL = MI.getDebugLoc(); 591 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 592 593 const TargetRegisterClass *SrcRC = 594 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 595 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 596 return false; 597 598 const unsigned SrcFlags = getUndefRegState(Src.isUndef()); 599 600 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 601 // source, and this relies on the fact that the same subregister indices are 602 // used for both. 603 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 604 for (int I = 0, E = NumDst; I != E; ++I) { 605 MachineOperand &Dst = MI.getOperand(I); 606 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 607 .addReg(SrcReg, SrcFlags, SubRegs[I]); 608 609 const TargetRegisterClass *DstRC = 610 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 611 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 612 return false; 613 } 614 615 MI.eraseFromParent(); 616 return true; 617 } 618 619 static bool isZero(Register Reg, const MachineRegisterInfo &MRI) { 620 int64_t Val; 621 return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0; 622 } 623 624 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( 625 MachineInstr &MI) const { 626 if (selectImpl(MI, *CoverageInfo)) 627 return true; 628 629 const LLT S32 = LLT::scalar(32); 630 const LLT V2S16 = LLT::vector(2, 16); 631 632 Register Dst = MI.getOperand(0).getReg(); 633 if (MRI->getType(Dst) != V2S16) 634 return false; 635 636 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI); 637 if (DstBank->getID() != AMDGPU::SGPRRegBankID) 638 return false; 639 640 Register Src0 = MI.getOperand(1).getReg(); 641 Register Src1 = MI.getOperand(2).getReg(); 642 if (MRI->getType(Src0) != S32) 643 return false; 644 645 const DebugLoc &DL = MI.getDebugLoc(); 646 MachineBasicBlock *BB = MI.getParent(); 647 648 // TODO: This should probably be a combine somewhere 649 // (build_vector_trunc $src0, undef -> copy $src0 650 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI); 651 if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) { 652 MI.setDesc(TII.get(AMDGPU::COPY)); 653 MI.RemoveOperand(2); 654 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) && 655 RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI); 656 } 657 658 Register ShiftSrc0; 659 Register ShiftSrc1; 660 int64_t ShiftAmt; 661 662 // With multiple uses of the shift, this will duplicate the shift and 663 // increase register pressure. 
664 // 665 // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16) 666 // => (S_PACK_HH_B32_B16 $src0, $src1) 667 // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16)) 668 // => (S_PACK_LH_B32_B16 $src0, $src1) 669 // (build_vector_trunc $src0, $src1) 670 // => (S_PACK_LL_B32_B16 $src0, $src1) 671 672 // FIXME: This is an inconvenient way to check a specific value 673 bool Shift0 = mi_match( 674 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) && 675 ShiftAmt == 16; 676 677 bool Shift1 = mi_match( 678 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) && 679 ShiftAmt == 16; 680 681 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16; 682 if (Shift0 && Shift1) { 683 Opc = AMDGPU::S_PACK_HH_B32_B16; 684 MI.getOperand(1).setReg(ShiftSrc0); 685 MI.getOperand(2).setReg(ShiftSrc1); 686 } else if (Shift1) { 687 Opc = AMDGPU::S_PACK_LH_B32_B16; 688 MI.getOperand(2).setReg(ShiftSrc1); 689 } else if (Shift0 && isZero(Src1, *MRI)) { 690 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 691 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) 692 .addReg(ShiftSrc0) 693 .addImm(16); 694 695 MI.eraseFromParent(); 696 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 697 } 698 699 MI.setDesc(TII.get(Opc)); 700 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); 701 } 702 703 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 704 return selectG_ADD_SUB(I); 705 } 706 707 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 708 const MachineOperand &MO = I.getOperand(0); 709 710 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 711 // regbank check here is to know why getConstrainedRegClassForOperand failed. 712 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 713 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 714 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 715 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 716 return true; 717 } 718 719 return false; 720 } 721 722 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 723 MachineBasicBlock *BB = I.getParent(); 724 725 Register DstReg = I.getOperand(0).getReg(); 726 Register Src0Reg = I.getOperand(1).getReg(); 727 Register Src1Reg = I.getOperand(2).getReg(); 728 LLT Src1Ty = MRI->getType(Src1Reg); 729 730 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 731 unsigned InsSize = Src1Ty.getSizeInBits(); 732 733 int64_t Offset = I.getOperand(3).getImm(); 734 735 // FIXME: These cases should have been illegal and unnecessary to check here. 736 if (Offset % 32 != 0 || InsSize % 32 != 0) 737 return false; 738 739 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 740 if (SubReg == AMDGPU::NoSubRegister) 741 return false; 742 743 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 744 const TargetRegisterClass *DstRC = 745 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 746 if (!DstRC) 747 return false; 748 749 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 750 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 751 const TargetRegisterClass *Src0RC = 752 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 753 const TargetRegisterClass *Src1RC = 754 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 755 756 // Deal with weird cases where the class only partially supports the subreg 757 // index. 
758 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); 759 if (!Src0RC || !Src1RC) 760 return false; 761 762 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 763 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) || 764 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI)) 765 return false; 766 767 const DebugLoc &DL = I.getDebugLoc(); 768 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg) 769 .addReg(Src0Reg) 770 .addReg(Src1Reg) 771 .addImm(SubReg); 772 773 I.eraseFromParent(); 774 return true; 775 } 776 777 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const { 778 if (STI.getLDSBankCount() != 16) 779 return selectImpl(MI, *CoverageInfo); 780 781 Register Dst = MI.getOperand(0).getReg(); 782 Register Src0 = MI.getOperand(2).getReg(); 783 Register M0Val = MI.getOperand(6).getReg(); 784 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) || 785 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) || 786 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI)) 787 return false; 788 789 // This requires 2 instructions. It is possible to write a pattern to support 790 // this, but the generated isel emitter doesn't correctly deal with multiple 791 // output instructions using the same physical register input. The copy to m0 792 // is incorrectly placed before the second instruction. 793 // 794 // TODO: Match source modifiers. 795 796 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 797 const DebugLoc &DL = MI.getDebugLoc(); 798 MachineBasicBlock *MBB = MI.getParent(); 799 800 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 801 .addReg(M0Val); 802 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov) 803 .addImm(2) 804 .addImm(MI.getOperand(4).getImm()) // $attr 805 .addImm(MI.getOperand(3).getImm()); // $attrchan 806 807 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst) 808 .addImm(0) // $src0_modifiers 809 .addReg(Src0) // $src0 810 .addImm(MI.getOperand(4).getImm()) // $attr 811 .addImm(MI.getOperand(3).getImm()) // $attrchan 812 .addImm(0) // $src2_modifiers 813 .addReg(InterpMov) // $src2 - 2 f16 values selected by high 814 .addImm(MI.getOperand(5).getImm()) // $high 815 .addImm(0) // $clamp 816 .addImm(0); // $omod 817 818 MI.eraseFromParent(); 819 return true; 820 } 821 822 // We need to handle this here because tablegen doesn't support matching 823 // instructions with multiple outputs. 824 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const { 825 Register Dst0 = MI.getOperand(0).getReg(); 826 Register Dst1 = MI.getOperand(1).getReg(); 827 828 LLT Ty = MRI->getType(Dst0); 829 unsigned Opc; 830 if (Ty == LLT::scalar(32)) 831 Opc = AMDGPU::V_DIV_SCALE_F32; 832 else if (Ty == LLT::scalar(64)) 833 Opc = AMDGPU::V_DIV_SCALE_F64; 834 else 835 return false; 836 837 const DebugLoc &DL = MI.getDebugLoc(); 838 MachineBasicBlock *MBB = MI.getParent(); 839 840 Register Numer = MI.getOperand(3).getReg(); 841 Register Denom = MI.getOperand(4).getReg(); 842 unsigned ChooseDenom = MI.getOperand(5).getImm(); 843 844 Register Src0 = ChooseDenom != 0 ? 
Numer : Denom; 845 846 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0) 847 .addDef(Dst1) 848 .addUse(Src0) 849 .addUse(Denom) 850 .addUse(Numer); 851 852 MI.eraseFromParent(); 853 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 854 } 855 856 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { 857 unsigned IntrinsicID = I.getIntrinsicID(); 858 switch (IntrinsicID) { 859 case Intrinsic::amdgcn_if_break: { 860 MachineBasicBlock *BB = I.getParent(); 861 862 // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick 863 // SelectionDAG uses for wave32 vs wave64. 864 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK)) 865 .add(I.getOperand(0)) 866 .add(I.getOperand(2)) 867 .add(I.getOperand(3)); 868 869 Register DstReg = I.getOperand(0).getReg(); 870 Register Src0Reg = I.getOperand(2).getReg(); 871 Register Src1Reg = I.getOperand(3).getReg(); 872 873 I.eraseFromParent(); 874 875 for (Register Reg : { DstReg, Src0Reg, Src1Reg }) 876 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 877 878 return true; 879 } 880 case Intrinsic::amdgcn_interp_p1_f16: 881 return selectInterpP1F16(I); 882 case Intrinsic::amdgcn_wqm: 883 return constrainCopyLikeIntrin(I, AMDGPU::WQM); 884 case Intrinsic::amdgcn_softwqm: 885 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); 886 case Intrinsic::amdgcn_wwm: 887 return constrainCopyLikeIntrin(I, AMDGPU::WWM); 888 case Intrinsic::amdgcn_div_scale: 889 return selectDivScale(I); 890 case Intrinsic::amdgcn_icmp: 891 return selectIntrinsicIcmp(I); 892 case Intrinsic::amdgcn_ballot: 893 return selectBallot(I); 894 default: 895 return selectImpl(I, *CoverageInfo); 896 } 897 } 898 899 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) { 900 if (Size != 32 && Size != 64) 901 return -1; 902 switch (P) { 903 default: 904 llvm_unreachable("Unknown condition code!"); 905 case CmpInst::ICMP_NE: 906 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64; 907 case CmpInst::ICMP_EQ: 908 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64; 909 case CmpInst::ICMP_SGT: 910 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64; 911 case CmpInst::ICMP_SGE: 912 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64; 913 case CmpInst::ICMP_SLT: 914 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64; 915 case CmpInst::ICMP_SLE: 916 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64; 917 case CmpInst::ICMP_UGT: 918 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64; 919 case CmpInst::ICMP_UGE: 920 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64; 921 case CmpInst::ICMP_ULT: 922 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64; 923 case CmpInst::ICMP_ULE: 924 return Size == 32 ? 
AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64; 925 } 926 } 927 928 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, 929 unsigned Size) const { 930 if (Size == 64) { 931 if (!STI.hasScalarCompareEq64()) 932 return -1; 933 934 switch (P) { 935 case CmpInst::ICMP_NE: 936 return AMDGPU::S_CMP_LG_U64; 937 case CmpInst::ICMP_EQ: 938 return AMDGPU::S_CMP_EQ_U64; 939 default: 940 return -1; 941 } 942 } 943 944 if (Size != 32) 945 return -1; 946 947 switch (P) { 948 case CmpInst::ICMP_NE: 949 return AMDGPU::S_CMP_LG_U32; 950 case CmpInst::ICMP_EQ: 951 return AMDGPU::S_CMP_EQ_U32; 952 case CmpInst::ICMP_SGT: 953 return AMDGPU::S_CMP_GT_I32; 954 case CmpInst::ICMP_SGE: 955 return AMDGPU::S_CMP_GE_I32; 956 case CmpInst::ICMP_SLT: 957 return AMDGPU::S_CMP_LT_I32; 958 case CmpInst::ICMP_SLE: 959 return AMDGPU::S_CMP_LE_I32; 960 case CmpInst::ICMP_UGT: 961 return AMDGPU::S_CMP_GT_U32; 962 case CmpInst::ICMP_UGE: 963 return AMDGPU::S_CMP_GE_U32; 964 case CmpInst::ICMP_ULT: 965 return AMDGPU::S_CMP_LT_U32; 966 case CmpInst::ICMP_ULE: 967 return AMDGPU::S_CMP_LE_U32; 968 default: 969 llvm_unreachable("Unknown condition code!"); 970 } 971 } 972 973 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { 974 MachineBasicBlock *BB = I.getParent(); 975 const DebugLoc &DL = I.getDebugLoc(); 976 977 Register SrcReg = I.getOperand(2).getReg(); 978 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 979 980 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 981 982 Register CCReg = I.getOperand(0).getReg(); 983 if (!isVCC(CCReg, *MRI)) { 984 int Opcode = getS_CMPOpcode(Pred, Size); 985 if (Opcode == -1) 986 return false; 987 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) 988 .add(I.getOperand(2)) 989 .add(I.getOperand(3)); 990 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) 991 .addReg(AMDGPU::SCC); 992 bool Ret = 993 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && 994 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); 995 I.eraseFromParent(); 996 return Ret; 997 } 998 999 int Opcode = getV_CMPOpcode(Pred, Size); 1000 if (Opcode == -1) 1001 return false; 1002 1003 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), 1004 I.getOperand(0).getReg()) 1005 .add(I.getOperand(2)) 1006 .add(I.getOperand(3)); 1007 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), 1008 *TRI.getBoolRC(), *MRI); 1009 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); 1010 I.eraseFromParent(); 1011 return Ret; 1012 } 1013 1014 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const { 1015 Register Dst = I.getOperand(0).getReg(); 1016 if (isVCC(Dst, *MRI)) 1017 return false; 1018 1019 if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize()) 1020 return false; 1021 1022 MachineBasicBlock *BB = I.getParent(); 1023 const DebugLoc &DL = I.getDebugLoc(); 1024 Register SrcReg = I.getOperand(2).getReg(); 1025 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 1026 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm()); 1027 1028 int Opcode = getV_CMPOpcode(Pred, Size); 1029 if (Opcode == -1) 1030 return false; 1031 1032 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst) 1033 .add(I.getOperand(2)) 1034 .add(I.getOperand(3)); 1035 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(), 1036 *MRI); 1037 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); 1038 I.eraseFromParent(); 1039 return Ret; 1040 } 1041 1042 bool 
AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { 1043 MachineBasicBlock *BB = I.getParent(); 1044 const DebugLoc &DL = I.getDebugLoc(); 1045 Register DstReg = I.getOperand(0).getReg(); 1046 const unsigned Size = MRI->getType(DstReg).getSizeInBits(); 1047 const bool Is64 = Size == 64; 1048 1049 if (Size != STI.getWavefrontSize()) 1050 return false; 1051 1052 Optional<ValueAndVReg> Arg = 1053 getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true); 1054 1055 if (Arg.hasValue()) { 1056 const int64_t Value = Arg.getValue().Value; 1057 if (Value == 0) { 1058 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 1059 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); 1060 } else if (Value == -1) { // all ones 1061 Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO; 1062 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg); 1063 } else 1064 return false; 1065 } else { 1066 Register SrcReg = I.getOperand(2).getReg(); 1067 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg); 1068 } 1069 1070 I.eraseFromParent(); 1071 return true; 1072 } 1073 1074 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { 1075 // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick 1076 // SelectionDAG uses for wave32 vs wave64. 1077 MachineBasicBlock *BB = MI.getParent(); 1078 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) 1079 .add(MI.getOperand(1)); 1080 1081 Register Reg = MI.getOperand(1).getReg(); 1082 MI.eraseFromParent(); 1083 1084 if (!MRI->getRegClassOrNull(Reg)) 1085 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 1086 return true; 1087 } 1088 1089 static unsigned getDSShaderTypeValue(const MachineFunction &MF) { 1090 switch (MF.getFunction().getCallingConv()) { 1091 case CallingConv::AMDGPU_PS: 1092 return 1; 1093 case CallingConv::AMDGPU_VS: 1094 return 2; 1095 case CallingConv::AMDGPU_GS: 1096 return 3; 1097 case CallingConv::AMDGPU_HS: 1098 case CallingConv::AMDGPU_LS: 1099 case CallingConv::AMDGPU_ES: 1100 report_fatal_error("ds_ordered_count unsupported for this calling conv"); 1101 case CallingConv::AMDGPU_CS: 1102 case CallingConv::AMDGPU_KERNEL: 1103 case CallingConv::C: 1104 case CallingConv::Fast: 1105 default: 1106 // Assume other calling conventions are various compute callable functions 1107 return 0; 1108 } 1109 } 1110 1111 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 1112 MachineInstr &MI, Intrinsic::ID IntrID) const { 1113 MachineBasicBlock *MBB = MI.getParent(); 1114 MachineFunction *MF = MBB->getParent(); 1115 const DebugLoc &DL = MI.getDebugLoc(); 1116 1117 unsigned IndexOperand = MI.getOperand(7).getImm(); 1118 bool WaveRelease = MI.getOperand(8).getImm() != 0; 1119 bool WaveDone = MI.getOperand(9).getImm() != 0; 1120 1121 if (WaveDone && !WaveRelease) 1122 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 1123 1124 unsigned OrderedCountIndex = IndexOperand & 0x3f; 1125 IndexOperand &= ~0x3f; 1126 unsigned CountDw = 0; 1127 1128 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 1129 CountDw = (IndexOperand >> 24) & 0xf; 1130 IndexOperand &= ~(0xf << 24); 1131 1132 if (CountDw < 1 || CountDw > 4) { 1133 report_fatal_error( 1134 "ds_ordered_count: dword count must be between 1 and 4"); 1135 } 1136 } 1137 1138 if (IndexOperand) 1139 report_fatal_error("ds_ordered_count: bad index operand"); 1140 1141 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; 1142 unsigned ShaderType = getDSShaderTypeValue(*MF); 1143 1144 unsigned Offset0 = OrderedCountIndex << 2; 1145 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | 1146 (Instruction << 4); 1147 1148 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) 1149 Offset1 |= (CountDw - 1) << 6; 1150 1151 unsigned Offset = Offset0 | (Offset1 << 8); 1152 1153 Register M0Val = MI.getOperand(2).getReg(); 1154 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1155 .addReg(M0Val); 1156 1157 Register DstReg = MI.getOperand(0).getReg(); 1158 Register ValReg = MI.getOperand(3).getReg(); 1159 MachineInstrBuilder DS = 1160 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) 1161 .addReg(ValReg) 1162 .addImm(Offset) 1163 .cloneMemRefs(MI); 1164 1165 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) 1166 return false; 1167 1168 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); 1169 MI.eraseFromParent(); 1170 return Ret; 1171 } 1172 1173 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 1174 switch (IntrID) { 1175 case Intrinsic::amdgcn_ds_gws_init: 1176 return AMDGPU::DS_GWS_INIT; 1177 case Intrinsic::amdgcn_ds_gws_barrier: 1178 return AMDGPU::DS_GWS_BARRIER; 1179 case Intrinsic::amdgcn_ds_gws_sema_v: 1180 return AMDGPU::DS_GWS_SEMA_V; 1181 case Intrinsic::amdgcn_ds_gws_sema_br: 1182 return AMDGPU::DS_GWS_SEMA_BR; 1183 case Intrinsic::amdgcn_ds_gws_sema_p: 1184 return AMDGPU::DS_GWS_SEMA_P; 1185 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1186 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 1187 default: 1188 llvm_unreachable("not a gws intrinsic"); 1189 } 1190 } 1191 1192 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, 1193 Intrinsic::ID IID) const { 1194 if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && 1195 !STI.hasGWSSemaReleaseAll()) 1196 return false; 1197 1198 // intrinsic ID, vsrc, offset 1199 const bool HasVSrc = MI.getNumOperands() == 3; 1200 assert(HasVSrc || MI.getNumOperands() == 2); 1201 1202 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg(); 1203 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); 1204 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) 1205 return false; 1206 1207 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1208 assert(OffsetDef); 1209 1210 unsigned ImmOffset; 1211 1212 MachineBasicBlock *MBB = MI.getParent(); 1213 const DebugLoc &DL = MI.getDebugLoc(); 1214 1215 MachineInstr *Readfirstlane = nullptr; 1216 1217 // If we legalized the VGPR input, strip out the readfirstlane to analyze the 1218 // incoming offset, in case there's an add of a constant. We'll have to put it 1219 // back later. 1220 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { 1221 Readfirstlane = OffsetDef; 1222 BaseOffset = OffsetDef->getOperand(1).getReg(); 1223 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1224 } 1225 1226 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { 1227 // If we have a constant offset, try to use the 0 in m0 as the base. 1228 // TODO: Look into changing the default m0 initialization value. If the 1229 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1230 // the immediate offset. 
1231 1232 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); 1233 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1234 .addImm(0); 1235 } else { 1236 std::tie(BaseOffset, ImmOffset, OffsetDef) 1237 = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset); 1238 1239 if (Readfirstlane) { 1240 // We have the constant offset now, so put the readfirstlane back on the 1241 // variable component. 1242 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) 1243 return false; 1244 1245 Readfirstlane->getOperand(1).setReg(BaseOffset); 1246 BaseOffset = Readfirstlane->getOperand(0).getReg(); 1247 } else { 1248 if (!RBI.constrainGenericRegister(BaseOffset, 1249 AMDGPU::SReg_32RegClass, *MRI)) 1250 return false; 1251 } 1252 1253 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1254 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) 1255 .addReg(BaseOffset) 1256 .addImm(16); 1257 1258 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1259 .addReg(M0Base); 1260 } 1261 1262 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1263 // offset field) % 64. Some versions of the programming guide omit the m0 1264 // part, or claim it's from offset 0. 1265 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); 1266 1267 if (HasVSrc) { 1268 Register VSrc = MI.getOperand(1).getReg(); 1269 MIB.addReg(VSrc); 1270 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) 1271 return false; 1272 } 1273 1274 MIB.addImm(ImmOffset) 1275 .addImm(-1) // $gds 1276 .cloneMemRefs(MI); 1277 1278 MI.eraseFromParent(); 1279 return true; 1280 } 1281 1282 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, 1283 bool IsAppend) const { 1284 Register PtrBase = MI.getOperand(2).getReg(); 1285 LLT PtrTy = MRI->getType(PtrBase); 1286 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 1287 1288 unsigned Offset; 1289 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); 1290 1291 // TODO: Should this try to look through readfirstlane like GWS? 1292 if (!isDSOffsetLegal(PtrBase, Offset, 16)) { 1293 PtrBase = MI.getOperand(2).getReg(); 1294 Offset = 0; 1295 } 1296 1297 MachineBasicBlock *MBB = MI.getParent(); 1298 const DebugLoc &DL = MI.getDebugLoc(); 1299 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1300 1301 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1302 .addReg(PtrBase); 1303 BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) 1304 .addImm(Offset) 1305 .addImm(IsGDS ? -1 : 0) 1306 .cloneMemRefs(MI); 1307 MI.eraseFromParent(); 1308 return true; 1309 } 1310 1311 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, 1312 bool &IsTexFail) { 1313 if (TexFailCtrl) 1314 IsTexFail = true; 1315 1316 TFE = (TexFailCtrl & 0x1) ? 1 : 0; 1317 TexFailCtrl &= ~(uint64_t)0x1; 1318 LWE = (TexFailCtrl & 0x2) ? 1 : 0; 1319 TexFailCtrl &= ~(uint64_t)0x2; 1320 1321 return TexFailCtrl == 0; 1322 } 1323 1324 static bool parseCachePolicy(uint64_t Value, 1325 bool *GLC, bool *SLC, bool *DLC) { 1326 if (GLC) { 1327 *GLC = (Value & 0x1) ? 1 : 0; 1328 Value &= ~(uint64_t)0x1; 1329 } 1330 if (SLC) { 1331 *SLC = (Value & 0x2) ? 1 : 0; 1332 Value &= ~(uint64_t)0x2; 1333 } 1334 if (DLC) { 1335 *DLC = (Value & 0x4) ? 
1 : 0; 1336 Value &= ~(uint64_t)0x4; 1337 } 1338 1339 return Value == 0; 1340 } 1341 1342 bool AMDGPUInstructionSelector::selectImageIntrinsic( 1343 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 1344 MachineBasicBlock *MBB = MI.getParent(); 1345 const DebugLoc &DL = MI.getDebugLoc(); 1346 1347 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 1348 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 1349 1350 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); 1351 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 1352 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); 1353 const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = 1354 AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode); 1355 unsigned IntrOpcode = Intr->BaseOpcode; 1356 const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10; 1357 1358 const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode, 1359 MI.getNumExplicitDefs()); 1360 int NumVAddr, NumGradients; 1361 std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode); 1362 1363 Register VDataIn, VDataOut; 1364 LLT VDataTy; 1365 int NumVDataDwords = -1; 1366 bool IsD16 = false; 1367 1368 // XXX - Can we just get the second to last argument for ctrl? 1369 unsigned CtrlIdx; // Index of texfailctrl argument 1370 bool Unorm; 1371 if (!BaseOpcode->Sampler) { 1372 Unorm = true; 1373 CtrlIdx = VAddrIdx + NumVAddr + 1; 1374 } else { 1375 Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0; 1376 CtrlIdx = VAddrIdx + NumVAddr + 3; 1377 } 1378 1379 bool TFE; 1380 bool LWE; 1381 bool IsTexFail = false; 1382 if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail)) 1383 return false; 1384 1385 const int Flags = MI.getOperand(CtrlIdx + 2).getImm(); 1386 const bool IsA16 = (Flags & 1) != 0; 1387 const bool IsG16 = (Flags & 2) != 0; 1388 1389 // A16 implies 16 bit gradients 1390 if (IsA16 && !IsG16) 1391 return false; 1392 1393 unsigned DMask = 0; 1394 unsigned DMaskLanes = 0; 1395 1396 if (BaseOpcode->Atomic) { 1397 VDataOut = MI.getOperand(0).getReg(); 1398 VDataIn = MI.getOperand(2).getReg(); 1399 LLT Ty = MRI->getType(VDataIn); 1400 1401 // Be careful to allow atomic swap on 16-bit element vectors. 1402 const bool Is64Bit = BaseOpcode->AtomicX2 ? 1403 Ty.getSizeInBits() == 128 : 1404 Ty.getSizeInBits() == 64; 1405 1406 if (BaseOpcode->AtomicX2) { 1407 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister); 1408 1409 DMask = Is64Bit ? 0xf : 0x3; 1410 NumVDataDwords = Is64Bit ? 4 : 2; 1411 } else { 1412 DMask = Is64Bit ? 0x3 : 0x1; 1413 NumVDataDwords = Is64Bit ? 2 : 1; 1414 } 1415 } else { 1416 const int DMaskIdx = 2; // Input/output + intrinsic ID. 1417 1418 DMask = MI.getOperand(DMaskIdx).getImm(); 1419 DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); 1420 1421 if (BaseOpcode->Store) { 1422 VDataIn = MI.getOperand(1).getReg(); 1423 VDataTy = MRI->getType(VDataIn); 1424 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32; 1425 } else { 1426 VDataOut = MI.getOperand(0).getReg(); 1427 VDataTy = MRI->getType(VDataOut); 1428 NumVDataDwords = DMaskLanes; 1429 1430 // One memoperand is mandatory, except for getresinfo. 1431 // FIXME: Check this in verifier. 1432 if (!MI.memoperands_empty()) { 1433 const MachineMemOperand *MMO = *MI.memoperands_begin(); 1434 1435 // Infer d16 from the memory size, as the register type will be mangled by 1436 // unpacked subtargets, or by TFE. 
1437 IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32; 1438 1439 if (IsD16 && !STI.hasUnpackedD16VMem()) 1440 NumVDataDwords = (DMaskLanes + 1) / 2; 1441 } 1442 } 1443 } 1444 1445 // Optimize _L to _LZ when _L is zero 1446 if (LZMappingInfo) { 1447 // The legalizer replaced the register with an immediate 0 if we need to 1448 // change the opcode. 1449 const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1); 1450 if (Lod.isImm()) { 1451 assert(Lod.getImm() == 0); 1452 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l 1453 } 1454 } 1455 1456 // Optimize _mip away, when 'lod' is zero 1457 if (MIPMappingInfo) { 1458 const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1); 1459 if (Lod.isImm()) { 1460 assert(Lod.getImm() == 0); 1461 IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip 1462 } 1463 } 1464 1465 // Set G16 opcode 1466 if (IsG16 && !IsA16) { 1467 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = 1468 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); 1469 assert(G16MappingInfo); 1470 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16 1471 } 1472 1473 // TODO: Check this in verifier. 1474 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); 1475 1476 bool GLC = false; 1477 bool SLC = false; 1478 bool DLC = false; 1479 if (BaseOpcode->Atomic) { 1480 GLC = true; // TODO no-return optimization 1481 if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC, 1482 IsGFX10 ? &DLC : nullptr)) 1483 return false; 1484 } else { 1485 if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC, 1486 IsGFX10 ? &DLC : nullptr)) 1487 return false; 1488 } 1489 1490 int NumVAddrRegs = 0; 1491 int NumVAddrDwords = 0; 1492 for (int I = 0; I < NumVAddr; ++I) { 1493 // Skip the $noregs and 0s inserted during legalization. 1494 MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I); 1495 if (!AddrOp.isReg()) 1496 continue; // XXX - Break? 1497 1498 Register Addr = AddrOp.getReg(); 1499 if (!Addr) 1500 break; 1501 1502 ++NumVAddrRegs; 1503 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32; 1504 } 1505 1506 // The legalizer preprocessed the intrinsic arguments. If we aren't using 1507 // NSA, these should have beeen packed into a single value in the first 1508 // address register 1509 const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs; 1510 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { 1511 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n"); 1512 return false; 1513 } 1514 1515 if (IsTexFail) 1516 ++NumVDataDwords; 1517 1518 int Opcode = -1; 1519 if (IsGFX10) { 1520 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 1521 UseNSA ? AMDGPU::MIMGEncGfx10NSA 1522 : AMDGPU::MIMGEncGfx10Default, 1523 NumVDataDwords, NumVAddrDwords); 1524 } else { 1525 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 1526 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, 1527 NumVDataDwords, NumVAddrDwords); 1528 if (Opcode == -1) 1529 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, 1530 NumVDataDwords, NumVAddrDwords); 1531 } 1532 assert(Opcode != -1); 1533 1534 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode)) 1535 .cloneMemRefs(MI); 1536 1537 if (VDataOut) { 1538 if (BaseOpcode->AtomicX2) { 1539 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64; 1540 1541 Register TmpReg = MRI->createVirtualRegister( 1542 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); 1543 unsigned SubReg = Is64 ? 
AMDGPU::sub0_sub1 : AMDGPU::sub0; 1544 1545 MIB.addDef(TmpReg); 1546 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut) 1547 .addReg(TmpReg, RegState::Kill, SubReg); 1548 1549 } else { 1550 MIB.addDef(VDataOut); // vdata output 1551 } 1552 } 1553 1554 if (VDataIn) 1555 MIB.addReg(VDataIn); // vdata input 1556 1557 for (int i = 0; i != NumVAddrRegs; ++i) { 1558 MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i); 1559 if (SrcOp.isReg()) { 1560 assert(SrcOp.getReg() != 0); 1561 MIB.addReg(SrcOp.getReg()); 1562 } 1563 } 1564 1565 MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc 1566 if (BaseOpcode->Sampler) 1567 MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler 1568 1569 MIB.addImm(DMask); // dmask 1570 1571 if (IsGFX10) 1572 MIB.addImm(DimInfo->Encoding); 1573 MIB.addImm(Unorm); 1574 if (IsGFX10) 1575 MIB.addImm(DLC); 1576 1577 MIB.addImm(GLC); 1578 MIB.addImm(SLC); 1579 MIB.addImm(IsA16 && // a16 or r128 1580 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0); 1581 if (IsGFX10) 1582 MIB.addImm(IsA16 ? -1 : 0); 1583 1584 MIB.addImm(TFE); // tfe 1585 MIB.addImm(LWE); // lwe 1586 if (!IsGFX10) 1587 MIB.addImm(DimInfo->DA ? -1 : 0); 1588 if (BaseOpcode->HasD16) 1589 MIB.addImm(IsD16 ? -1 : 0); 1590 1591 MI.eraseFromParent(); 1592 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1593 } 1594 1595 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( 1596 MachineInstr &I) const { 1597 unsigned IntrinsicID = I.getIntrinsicID(); 1598 switch (IntrinsicID) { 1599 case Intrinsic::amdgcn_end_cf: 1600 return selectEndCfIntrinsic(I); 1601 case Intrinsic::amdgcn_ds_ordered_add: 1602 case Intrinsic::amdgcn_ds_ordered_swap: 1603 return selectDSOrderedIntrinsic(I, IntrinsicID); 1604 case Intrinsic::amdgcn_ds_gws_init: 1605 case Intrinsic::amdgcn_ds_gws_barrier: 1606 case Intrinsic::amdgcn_ds_gws_sema_v: 1607 case Intrinsic::amdgcn_ds_gws_sema_br: 1608 case Intrinsic::amdgcn_ds_gws_sema_p: 1609 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1610 return selectDSGWSIntrinsic(I, IntrinsicID); 1611 case Intrinsic::amdgcn_ds_append: 1612 return selectDSAppendConsume(I, true); 1613 case Intrinsic::amdgcn_ds_consume: 1614 return selectDSAppendConsume(I, false); 1615 default: { 1616 return selectImpl(I, *CoverageInfo); 1617 } 1618 } 1619 } 1620 1621 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { 1622 if (selectImpl(I, *CoverageInfo)) 1623 return true; 1624 1625 MachineBasicBlock *BB = I.getParent(); 1626 const DebugLoc &DL = I.getDebugLoc(); 1627 1628 Register DstReg = I.getOperand(0).getReg(); 1629 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 1630 assert(Size <= 32 || Size == 64); 1631 const MachineOperand &CCOp = I.getOperand(1); 1632 Register CCReg = CCOp.getReg(); 1633 if (!isVCC(CCReg, *MRI)) { 1634 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : 1635 AMDGPU::S_CSELECT_B32; 1636 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1637 .addReg(CCReg); 1638 1639 // The generic constrainSelectedInstRegOperands doesn't work for the scc register 1640 // bank, because it does not cover the register class that we used to represent 1641 // for it. So we need to manually set the register class here. 
1642 if (!MRI->getRegClassOrNull(CCReg)) 1643 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1644 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1645 .add(I.getOperand(2)) 1646 .add(I.getOperand(3)); 1647 1648 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1649 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1650 I.eraseFromParent(); 1651 return Ret; 1652 } 1653 1654 // Wide VGPR select should have been split in RegBankSelect. 1655 if (Size > 32) 1656 return false; 1657 1658 MachineInstr *Select = 1659 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1660 .addImm(0) 1661 .add(I.getOperand(3)) 1662 .addImm(0) 1663 .add(I.getOperand(2)) 1664 .add(I.getOperand(1)); 1665 1666 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1667 I.eraseFromParent(); 1668 return Ret; 1669 } 1670 1671 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1672 initM0(I); 1673 return selectImpl(I, *CoverageInfo); 1674 } 1675 1676 static int sizeToSubRegIndex(unsigned Size) { 1677 switch (Size) { 1678 case 32: 1679 return AMDGPU::sub0; 1680 case 64: 1681 return AMDGPU::sub0_sub1; 1682 case 96: 1683 return AMDGPU::sub0_sub1_sub2; 1684 case 128: 1685 return AMDGPU::sub0_sub1_sub2_sub3; 1686 case 256: 1687 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1688 default: 1689 if (Size < 32) 1690 return AMDGPU::sub0; 1691 if (Size > 256) 1692 return -1; 1693 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1694 } 1695 } 1696 1697 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1698 Register DstReg = I.getOperand(0).getReg(); 1699 Register SrcReg = I.getOperand(1).getReg(); 1700 const LLT DstTy = MRI->getType(DstReg); 1701 const LLT SrcTy = MRI->getType(SrcReg); 1702 const LLT S1 = LLT::scalar(1); 1703 1704 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1705 const RegisterBank *DstRB; 1706 if (DstTy == S1) { 1707 // This is a special case. We don't treat s1 for legalization artifacts as 1708 // vcc booleans. 1709 DstRB = SrcRB; 1710 } else { 1711 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1712 if (SrcRB != DstRB) 1713 return false; 1714 } 1715 1716 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 1717 1718 unsigned DstSize = DstTy.getSizeInBits(); 1719 unsigned SrcSize = SrcTy.getSizeInBits(); 1720 1721 const TargetRegisterClass *SrcRC 1722 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1723 const TargetRegisterClass *DstRC 1724 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1725 if (!SrcRC || !DstRC) 1726 return false; 1727 1728 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1729 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1730 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1731 return false; 1732 } 1733 1734 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { 1735 MachineBasicBlock *MBB = I.getParent(); 1736 const DebugLoc &DL = I.getDebugLoc(); 1737 1738 Register LoReg = MRI->createVirtualRegister(DstRC); 1739 Register HiReg = MRI->createVirtualRegister(DstRC); 1740 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 1741 .addReg(SrcReg, 0, AMDGPU::sub0); 1742 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 1743 .addReg(SrcReg, 0, AMDGPU::sub1); 1744 1745 if (IsVALU && STI.hasSDWA()) { 1746 // Write the low 16-bits of the high element into the high 16-bits of the 1747 // low element. 
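// A single SDWA mov packs the result: dst_sel = WORD_1 writes only the high
// 16 bits of DstReg, UNUSED_PRESERVE keeps the low 16 bits (supplied through
// the tied implicit use of LoReg added below), and src0_sel = WORD_0 reads
// the low 16 bits of HiReg.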
1748 MachineInstr *MovSDWA = 1749 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1750 .addImm(0) // $src0_modifiers 1751 .addReg(HiReg) // $src0 1752 .addImm(0) // $clamp 1753 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1754 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1755 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1756 .addReg(LoReg, RegState::Implicit); 1757 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1758 } else { 1759 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1760 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1761 Register ImmReg = MRI->createVirtualRegister(DstRC); 1762 if (IsVALU) { 1763 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1764 .addImm(16) 1765 .addReg(HiReg); 1766 } else { 1767 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1768 .addReg(HiReg) 1769 .addImm(16); 1770 } 1771 1772 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1773 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1774 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1775 1776 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1777 .addImm(0xffff); 1778 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1779 .addReg(LoReg) 1780 .addReg(ImmReg); 1781 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1782 .addReg(TmpReg0) 1783 .addReg(TmpReg1); 1784 } 1785 1786 I.eraseFromParent(); 1787 return true; 1788 } 1789 1790 if (!DstTy.isScalar()) 1791 return false; 1792 1793 if (SrcSize > 32) { 1794 int SubRegIdx = sizeToSubRegIndex(DstSize); 1795 if (SubRegIdx == -1) 1796 return false; 1797 1798 // Deal with weird cases where the class only partially supports the subreg 1799 // index. 1800 const TargetRegisterClass *SrcWithSubRC 1801 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1802 if (!SrcWithSubRC) 1803 return false; 1804 1805 if (SrcWithSubRC != SrcRC) { 1806 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1807 return false; 1808 } 1809 1810 I.getOperand(1).setSubReg(SubRegIdx); 1811 } 1812 1813 I.setDesc(TII.get(TargetOpcode::COPY)); 1814 return true; 1815 } 1816 1817 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1818 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1819 Mask = maskTrailingOnes<unsigned>(Size); 1820 int SignedMask = static_cast<int>(Mask); 1821 return SignedMask >= -16 && SignedMask <= 64; 1822 } 1823 1824 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1825 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1826 Register Reg, const MachineRegisterInfo &MRI, 1827 const TargetRegisterInfo &TRI) const { 1828 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1829 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1830 return RB; 1831 1832 // Ignore the type, since we don't use vcc in artifacts. 
1833 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1834 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1835 return nullptr; 1836 } 1837 1838 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1839 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1840 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1841 const DebugLoc &DL = I.getDebugLoc(); 1842 MachineBasicBlock &MBB = *I.getParent(); 1843 const Register DstReg = I.getOperand(0).getReg(); 1844 const Register SrcReg = I.getOperand(1).getReg(); 1845 1846 const LLT DstTy = MRI->getType(DstReg); 1847 const LLT SrcTy = MRI->getType(SrcReg); 1848 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1849 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1850 const unsigned DstSize = DstTy.getSizeInBits(); 1851 if (!DstTy.isScalar()) 1852 return false; 1853 1854 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1855 return selectCOPY(I); 1856 1857 // Artifact casts should never use vcc. 1858 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1859 1860 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1861 // 64-bit should have been split up in RegBankSelect 1862 1863 // Try to use an and with a mask if it will save code size. 1864 unsigned Mask; 1865 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1866 MachineInstr *ExtI = 1867 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1868 .addImm(Mask) 1869 .addReg(SrcReg); 1870 I.eraseFromParent(); 1871 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1872 } 1873 1874 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1875 MachineInstr *ExtI = 1876 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1877 .addReg(SrcReg) 1878 .addImm(0) // Offset 1879 .addImm(SrcSize); // Width 1880 I.eraseFromParent(); 1881 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1882 } 1883 1884 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1885 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 1886 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 1887 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 1888 return false; 1889 1890 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1891 const unsigned SextOpc = SrcSize == 8 ? 1892 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1893 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1894 .addReg(SrcReg); 1895 I.eraseFromParent(); 1896 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1897 } 1898 1899 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1900 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1901 1902 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1903 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 1904 // We need a 64-bit register source, but the high bits don't matter. 1905 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1906 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1907 unsigned SubReg = InReg ? 
AMDGPU::sub0 : 0; 1908 1909 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1910 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1911 .addReg(SrcReg, 0, SubReg) 1912 .addImm(AMDGPU::sub0) 1913 .addReg(UndefReg) 1914 .addImm(AMDGPU::sub1); 1915 1916 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1917 .addReg(ExtReg) 1918 .addImm(SrcSize << 16); 1919 1920 I.eraseFromParent(); 1921 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1922 } 1923 1924 unsigned Mask; 1925 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1926 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1927 .addReg(SrcReg) 1928 .addImm(Mask); 1929 } else { 1930 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1931 .addReg(SrcReg) 1932 .addImm(SrcSize << 16); 1933 } 1934 1935 I.eraseFromParent(); 1936 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1937 } 1938 1939 return false; 1940 } 1941 1942 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1943 MachineBasicBlock *BB = I.getParent(); 1944 MachineOperand &ImmOp = I.getOperand(1); 1945 1946 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1947 if (ImmOp.isFPImm()) { 1948 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1949 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1950 } else if (ImmOp.isCImm()) { 1951 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1952 } 1953 1954 Register DstReg = I.getOperand(0).getReg(); 1955 unsigned Size; 1956 bool IsSgpr; 1957 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1958 if (RB) { 1959 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1960 Size = MRI->getType(DstReg).getSizeInBits(); 1961 } else { 1962 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1963 IsSgpr = TRI.isSGPRClass(RC); 1964 Size = TRI.getRegSizeInBits(*RC); 1965 } 1966 1967 if (Size != 32 && Size != 64) 1968 return false; 1969 1970 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1971 if (Size == 32) { 1972 I.setDesc(TII.get(Opcode)); 1973 I.addImplicitDefUseOperands(*MF); 1974 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1975 } 1976 1977 const DebugLoc &DL = I.getDebugLoc(); 1978 1979 APInt Imm(Size, I.getOperand(1).getImm()); 1980 1981 MachineInstr *ResInst; 1982 if (IsSgpr && TII.isInlineConstant(Imm)) { 1983 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1984 .addImm(I.getOperand(1).getImm()); 1985 } else { 1986 const TargetRegisterClass *RC = IsSgpr ? 
1987 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 1988 Register LoReg = MRI->createVirtualRegister(RC); 1989 Register HiReg = MRI->createVirtualRegister(RC); 1990 1991 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 1992 .addImm(Imm.trunc(32).getZExtValue()); 1993 1994 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 1995 .addImm(Imm.ashr(32).getZExtValue()); 1996 1997 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1998 .addReg(LoReg) 1999 .addImm(AMDGPU::sub0) 2000 .addReg(HiReg) 2001 .addImm(AMDGPU::sub1); 2002 } 2003 2004 // We can't call constrainSelectedInstRegOperands here, because it doesn't 2005 // work for target independent opcodes 2006 I.eraseFromParent(); 2007 const TargetRegisterClass *DstRC = 2008 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 2009 if (!DstRC) 2010 return true; 2011 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 2012 } 2013 2014 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2015 // Only manually handle the f64 SGPR case. 2016 // 2017 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2018 // the bit ops theoretically have a second result due to the implicit def of 2019 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2020 // that is easy by disabling the check. The result works, but uses a 2021 // nonsensical sreg32orlds_and_sreg_1 regclass. 2022 // 2023 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2024 // the variadic REG_SEQUENCE operands. 2025 2026 Register Dst = MI.getOperand(0).getReg(); 2027 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2028 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2029 MRI->getType(Dst) != LLT::scalar(64)) 2030 return false; 2031 2032 Register Src = MI.getOperand(1).getReg(); 2033 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 2034 if (Fabs) 2035 Src = Fabs->getOperand(1).getReg(); 2036 2037 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2038 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2039 return false; 2040 2041 MachineBasicBlock *BB = MI.getParent(); 2042 const DebugLoc &DL = MI.getDebugLoc(); 2043 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2044 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2045 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2046 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2047 2048 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2049 .addReg(Src, 0, AMDGPU::sub0); 2050 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2051 .addReg(Src, 0, AMDGPU::sub1); 2052 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2053 .addImm(0x80000000); 2054 2055 // Set or toggle sign bit. 2056 unsigned Opc = Fabs ? 
AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32; 2057 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg) 2058 .addReg(HiReg) 2059 .addReg(ConstReg); 2060 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2061 .addReg(LoReg) 2062 .addImm(AMDGPU::sub0) 2063 .addReg(OpReg) 2064 .addImm(AMDGPU::sub1); 2065 MI.eraseFromParent(); 2066 return true; 2067 } 2068 2069 // FIXME: This is a workaround for the same tablegen problems as G_FNEG 2070 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { 2071 Register Dst = MI.getOperand(0).getReg(); 2072 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2073 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2074 MRI->getType(Dst) != LLT::scalar(64)) 2075 return false; 2076 2077 Register Src = MI.getOperand(1).getReg(); 2078 MachineBasicBlock *BB = MI.getParent(); 2079 const DebugLoc &DL = MI.getDebugLoc(); 2080 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2081 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2082 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2083 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2084 2085 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2086 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2087 return false; 2088 2089 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2090 .addReg(Src, 0, AMDGPU::sub0); 2091 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2092 .addReg(Src, 0, AMDGPU::sub1); 2093 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2094 .addImm(0x7fffffff); 2095 2096 // Clear sign bit. 2097 // TODO: Should this used S_BITSET0_*? 2098 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) 2099 .addReg(HiReg) 2100 .addReg(ConstReg); 2101 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2102 .addReg(LoReg) 2103 .addImm(AMDGPU::sub0) 2104 .addReg(OpReg) 2105 .addImm(AMDGPU::sub1); 2106 2107 MI.eraseFromParent(); 2108 return true; 2109 } 2110 2111 static bool isConstant(const MachineInstr &MI) { 2112 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 2113 } 2114 2115 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 2116 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 2117 2118 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); 2119 2120 assert(PtrMI); 2121 2122 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 2123 return; 2124 2125 GEPInfo GEPInfo(*PtrMI); 2126 2127 for (unsigned i = 1; i != 3; ++i) { 2128 const MachineOperand &GEPOp = PtrMI->getOperand(i); 2129 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 2130 assert(OpDef); 2131 if (i == 2 && isConstant(*OpDef)) { 2132 // TODO: Could handle constant base + variable offset, but a combine 2133 // probably should have commuted it. 
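// Operand 2 of the G_PTR_ADD is folded into GEPInfo.Imm when it is defined
// by a G_CONSTANT; any non-constant operand is instead classified by its
// register bank into SgprParts or VgprParts below.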
2134 assert(GEPInfo.Imm == 0); 2135 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 2136 continue; 2137 } 2138 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 2139 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 2140 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 2141 else 2142 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 2143 } 2144 2145 AddrInfo.push_back(GEPInfo); 2146 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 2147 } 2148 2149 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 2150 if (!MI.hasOneMemOperand()) 2151 return false; 2152 2153 const MachineMemOperand *MMO = *MI.memoperands_begin(); 2154 const Value *Ptr = MMO->getValue(); 2155 2156 // UndefValue means this is a load of a kernel input. These are uniform. 2157 // Sometimes LDS instructions have constant pointers. 2158 // If Ptr is null, then that means this mem operand contains a 2159 // PseudoSourceValue like GOT. 2160 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || 2161 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) 2162 return true; 2163 2164 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 2165 return true; 2166 2167 const Instruction *I = dyn_cast<Instruction>(Ptr); 2168 return I && I->getMetadata("amdgpu.uniform"); 2169 } 2170 2171 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { 2172 for (const GEPInfo &GEPInfo : AddrInfo) { 2173 if (!GEPInfo.VgprParts.empty()) 2174 return true; 2175 } 2176 return false; 2177 } 2178 2179 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { 2180 MachineBasicBlock *BB = I.getParent(); 2181 2182 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); 2183 unsigned AS = PtrTy.getAddressSpace(); 2184 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && 2185 STI.ldsRequiresM0Init()) { 2186 // If DS instructions require M0 initializtion, insert it before selecting. 2187 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 2188 .addImm(-1); 2189 } 2190 } 2191 2192 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { 2193 initM0(I); 2194 return selectImpl(I, *CoverageInfo); 2195 } 2196 2197 // TODO: No rtn optimization. 2198 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG( 2199 MachineInstr &MI) const { 2200 Register PtrReg = MI.getOperand(1).getReg(); 2201 const LLT PtrTy = MRI->getType(PtrReg); 2202 if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || 2203 STI.useFlatForGlobal()) 2204 return selectImpl(MI, *CoverageInfo); 2205 2206 Register DstReg = MI.getOperand(0).getReg(); 2207 const LLT Ty = MRI->getType(DstReg); 2208 const bool Is64 = Ty.getSizeInBits() == 64; 2209 const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; 2210 Register TmpReg = MRI->createVirtualRegister( 2211 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); 2212 2213 const DebugLoc &DL = MI.getDebugLoc(); 2214 MachineBasicBlock *BB = MI.getParent(); 2215 2216 Register VAddr, RSrcReg, SOffset; 2217 int64_t Offset = 0; 2218 2219 unsigned Opcode; 2220 if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) { 2221 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN : 2222 AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN; 2223 } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr, 2224 RSrcReg, SOffset, Offset)) { 2225 Opcode = Is64 ? 
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : 2226 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; 2227 } else 2228 return selectImpl(MI, *CoverageInfo); 2229 2230 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) 2231 .addReg(MI.getOperand(2).getReg()); 2232 2233 if (VAddr) 2234 MIB.addReg(VAddr); 2235 2236 MIB.addReg(RSrcReg); 2237 if (SOffset) 2238 MIB.addReg(SOffset); 2239 else 2240 MIB.addImm(0); 2241 2242 MIB.addImm(Offset); 2243 MIB.addImm(0); // slc 2244 MIB.cloneMemRefs(MI); 2245 2246 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) 2247 .addReg(TmpReg, RegState::Kill, SubReg); 2248 2249 MI.eraseFromParent(); 2250 2251 MRI->setRegClass( 2252 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); 2253 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2254 } 2255 2256 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 2257 MachineBasicBlock *BB = I.getParent(); 2258 MachineOperand &CondOp = I.getOperand(0); 2259 Register CondReg = CondOp.getReg(); 2260 const DebugLoc &DL = I.getDebugLoc(); 2261 2262 unsigned BrOpcode; 2263 Register CondPhysReg; 2264 const TargetRegisterClass *ConstrainRC; 2265 2266 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2267 // whether the branch is uniform when selecting the instruction. In 2268 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 2269 // RegBankSelect knows what it's doing if the branch condition is scc, even 2270 // though it currently does not. 2271 if (!isVCC(CondReg, *MRI)) { 2272 if (MRI->getType(CondReg) != LLT::scalar(32)) 2273 return false; 2274 2275 CondPhysReg = AMDGPU::SCC; 2276 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 2277 // FIXME: Hack for isSCC tests 2278 ConstrainRC = &AMDGPU::SGPR_32RegClass; 2279 } else { 2280 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 2281 // We sort of know that a VCC producer based on the register bank, that ands 2282 // inactive lanes with 0. What if there was a logical operation with vcc 2283 // producers in different blocks/with different exec masks? 2284 // FIXME: Should scc->vcc copies and with exec? 2285 CondPhysReg = TRI.getVCC(); 2286 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 2287 ConstrainRC = TRI.getBoolRC(); 2288 } 2289 2290 if (!MRI->getRegClassOrNull(CondReg)) 2291 MRI->setRegClass(CondReg, ConstrainRC); 2292 2293 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 2294 .addReg(CondReg); 2295 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 2296 .addMBB(I.getOperand(1).getMBB()); 2297 2298 I.eraseFromParent(); 2299 return true; 2300 } 2301 2302 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE( 2303 MachineInstr &I) const { 2304 Register DstReg = I.getOperand(0).getReg(); 2305 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2306 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2307 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 2308 if (IsVGPR) 2309 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 2310 2311 return RBI.constrainGenericRegister( 2312 DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 2313 } 2314 2315 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 2316 Register DstReg = I.getOperand(0).getReg(); 2317 Register SrcReg = I.getOperand(1).getReg(); 2318 Register MaskReg = I.getOperand(2).getReg(); 2319 LLT Ty = MRI->getType(DstReg); 2320 LLT MaskTy = MRI->getType(MaskReg); 2321 2322 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2323 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2324 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 2325 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2326 if (DstRB != SrcRB) // Should only happen for hand written MIR. 2327 return false; 2328 2329 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2330 const TargetRegisterClass &RegRC 2331 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2332 2333 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 2334 *MRI); 2335 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 2336 *MRI); 2337 const TargetRegisterClass *MaskRC = 2338 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); 2339 2340 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2341 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2342 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 2343 return false; 2344 2345 MachineBasicBlock *BB = I.getParent(); 2346 const DebugLoc &DL = I.getDebugLoc(); 2347 if (Ty.getSizeInBits() == 32) { 2348 assert(MaskTy.getSizeInBits() == 32 && 2349 "ptrmask should have been narrowed during legalize"); 2350 2351 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 2352 .addReg(SrcReg) 2353 .addReg(MaskReg); 2354 I.eraseFromParent(); 2355 return true; 2356 } 2357 2358 Register HiReg = MRI->createVirtualRegister(&RegRC); 2359 Register LoReg = MRI->createVirtualRegister(&RegRC); 2360 2361 // Extract the subregisters from the source pointer. 2362 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 2363 .addReg(SrcReg, 0, AMDGPU::sub0); 2364 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 2365 .addReg(SrcReg, 0, AMDGPU::sub1); 2366 2367 Register MaskedLo, MaskedHi; 2368 2369 // Try to avoid emitting a bit operation when we only need to touch half of 2370 // the 64-bit pointer. 2371 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); 2372 2373 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 2374 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 2375 if ((MaskOnes & MaskLo32) == MaskLo32) { 2376 // If all the bits in the low half are 1, we only need a copy for it. 2377 MaskedLo = LoReg; 2378 } else { 2379 // Extract the mask subregister and apply the and. 2380 Register MaskLo = MRI->createVirtualRegister(&RegRC); 2381 MaskedLo = MRI->createVirtualRegister(&RegRC); 2382 2383 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 2384 .addReg(MaskReg, 0, AMDGPU::sub0); 2385 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 2386 .addReg(LoReg) 2387 .addReg(MaskLo); 2388 } 2389 2390 if ((MaskOnes & MaskHi32) == MaskHi32) { 2391 // If all the bits in the high half are 1, we only need a copy for it. 
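// Illustrative example: clearing the low 12 bits of a 64-bit pointer uses
// the mask 0xffff'ffff'ffff'f000, whose high 32 bits are all ones, so the
// high dword is reused directly and only the low dword needs an AND.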
2392 MaskedHi = HiReg; 2393 } else { 2394 Register MaskHi = MRI->createVirtualRegister(&RegRC); 2395 MaskedHi = MRI->createVirtualRegister(&RegRC); 2396 2397 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 2398 .addReg(MaskReg, 0, AMDGPU::sub1); 2399 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 2400 .addReg(HiReg) 2401 .addReg(MaskHi); 2402 } 2403 2404 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2405 .addReg(MaskedLo) 2406 .addImm(AMDGPU::sub0) 2407 .addReg(MaskedHi) 2408 .addImm(AMDGPU::sub1); 2409 I.eraseFromParent(); 2410 return true; 2411 } 2412 2413 /// Return the register to use for the index value, and the subregister to use 2414 /// for the indirectly accessed register. 2415 static std::pair<Register, unsigned> 2416 computeIndirectRegIndex(MachineRegisterInfo &MRI, 2417 const SIRegisterInfo &TRI, 2418 const TargetRegisterClass *SuperRC, 2419 Register IdxReg, 2420 unsigned EltSize) { 2421 Register IdxBaseReg; 2422 int Offset; 2423 MachineInstr *Unused; 2424 2425 std::tie(IdxBaseReg, Offset, Unused) 2426 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 2427 if (IdxBaseReg == AMDGPU::NoRegister) { 2428 // This will happen if the index is a known constant. This should ordinarily 2429 // be legalized out, but handle it as a register just in case. 2430 assert(Offset == 0); 2431 IdxBaseReg = IdxReg; 2432 } 2433 2434 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 2435 2436 // Skip out of bounds offsets, or else we would end up using an undefined 2437 // register. 2438 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 2439 return std::make_pair(IdxReg, SubRegs[0]); 2440 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 2441 } 2442 2443 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 2444 MachineInstr &MI) const { 2445 Register DstReg = MI.getOperand(0).getReg(); 2446 Register SrcReg = MI.getOperand(1).getReg(); 2447 Register IdxReg = MI.getOperand(2).getReg(); 2448 2449 LLT DstTy = MRI->getType(DstReg); 2450 LLT SrcTy = MRI->getType(SrcReg); 2451 2452 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2453 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2454 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2455 2456 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2457 // into a waterfall loop. 2458 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2459 return false; 2460 2461 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 2462 *MRI); 2463 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 2464 *MRI); 2465 if (!SrcRC || !DstRC) 2466 return false; 2467 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2468 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2469 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2470 return false; 2471 2472 MachineBasicBlock *BB = MI.getParent(); 2473 const DebugLoc &DL = MI.getDebugLoc(); 2474 const bool Is64 = DstTy.getSizeInBits() == 64; 2475 2476 unsigned SubReg; 2477 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 2478 DstTy.getSizeInBits() / 8); 2479 2480 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 2481 if (DstTy.getSizeInBits() != 32 && !Is64) 2482 return false; 2483 2484 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2485 .addReg(IdxReg); 2486 2487 unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 2488 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 2489 .addReg(SrcReg, 0, SubReg) 2490 .addReg(SrcReg, RegState::Implicit); 2491 MI.eraseFromParent(); 2492 return true; 2493 } 2494 2495 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 2496 return false; 2497 2498 if (!STI.useVGPRIndexMode()) { 2499 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2500 .addReg(IdxReg); 2501 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 2502 .addReg(SrcReg, RegState::Undef, SubReg) 2503 .addReg(SrcReg, RegState::Implicit); 2504 MI.eraseFromParent(); 2505 return true; 2506 } 2507 2508 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2509 .addReg(IdxReg) 2510 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 2511 BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg) 2512 .addReg(SrcReg, RegState::Undef, SubReg) 2513 .addReg(SrcReg, RegState::Implicit) 2514 .addReg(AMDGPU::M0, RegState::Implicit); 2515 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2516 2517 MI.eraseFromParent(); 2518 return true; 2519 } 2520 2521 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 2522 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 2523 MachineInstr &MI) const { 2524 Register DstReg = MI.getOperand(0).getReg(); 2525 Register VecReg = MI.getOperand(1).getReg(); 2526 Register ValReg = MI.getOperand(2).getReg(); 2527 Register IdxReg = MI.getOperand(3).getReg(); 2528 2529 LLT VecTy = MRI->getType(DstReg); 2530 LLT ValTy = MRI->getType(ValReg); 2531 unsigned VecSize = VecTy.getSizeInBits(); 2532 unsigned ValSize = ValTy.getSizeInBits(); 2533 2534 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 2535 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 2536 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2537 2538 assert(VecTy.getElementType() == ValTy); 2539 2540 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2541 // into a waterfall loop. 
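// Overall strategy (descriptive, added): constrain the vector, value and
// index registers, fold any constant offset on the index into a subregister
// choice, then write the element either in VGPR index mode
// (S_SET_GPR_IDX_ON/OFF) or through M0 with an indirect register-write
// pseudo.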
2542 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2543 return false; 2544 2545 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 2546 *MRI); 2547 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 2548 *MRI); 2549 2550 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 2551 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 2552 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 2553 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2554 return false; 2555 2556 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 2557 return false; 2558 2559 unsigned SubReg; 2560 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 2561 ValSize / 8); 2562 2563 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 2564 STI.useVGPRIndexMode(); 2565 2566 MachineBasicBlock *BB = MI.getParent(); 2567 const DebugLoc &DL = MI.getDebugLoc(); 2568 2569 if (IndexMode) { 2570 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2571 .addReg(IdxReg) 2572 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 2573 } else { 2574 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2575 .addReg(IdxReg); 2576 } 2577 2578 const MCInstrDesc &RegWriteOp 2579 = TII.getIndirectRegWritePseudo(VecSize, ValSize, 2580 VecRB->getID() == AMDGPU::SGPRRegBankID); 2581 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 2582 .addReg(VecReg) 2583 .addReg(ValReg) 2584 .addImm(SubReg); 2585 2586 if (IndexMode) 2587 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2588 2589 MI.eraseFromParent(); 2590 return true; 2591 } 2592 2593 static bool isZeroOrUndef(int X) { 2594 return X == 0 || X == -1; 2595 } 2596 2597 static bool isOneOrUndef(int X) { 2598 return X == 1 || X == -1; 2599 } 2600 2601 static bool isZeroOrOneOrUndef(int X) { 2602 return X == 0 || X == 1 || X == -1; 2603 } 2604 2605 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single 2606 // 32-bit register. 2607 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, 2608 ArrayRef<int> Mask) { 2609 NewMask[0] = Mask[0]; 2610 NewMask[1] = Mask[1]; 2611 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) 2612 return Src0; 2613 2614 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); 2615 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); 2616 2617 // Shift the mask inputs to be 0/1; 2618 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; 2619 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; 2620 return Src1; 2621 } 2622 2623 // This is only legal with VOP3P instructions as an aid to op_sel matching. 
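// A G_SHUFFLE_VECTOR of <2 x s16> with a legal VOP3P mask reads at most one
// of its sources. normalizeVOP3PMask() rewrites mask elements 2 and 3 (which
// refer to Src1) down to 0 and 1 and reports which source is actually read;
// the surviving cases are then emitted as shifts, SDWA moves, S_PACK_* or
// v_alignbit_b32.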
2624 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( 2625 MachineInstr &MI) const { 2626 Register DstReg = MI.getOperand(0).getReg(); 2627 Register Src0Reg = MI.getOperand(1).getReg(); 2628 Register Src1Reg = MI.getOperand(2).getReg(); 2629 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); 2630 2631 const LLT V2S16 = LLT::vector(2, 16); 2632 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) 2633 return false; 2634 2635 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) 2636 return false; 2637 2638 assert(ShufMask.size() == 2); 2639 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); 2640 2641 MachineBasicBlock *MBB = MI.getParent(); 2642 const DebugLoc &DL = MI.getDebugLoc(); 2643 2644 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2645 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2646 const TargetRegisterClass &RC = IsVALU ? 2647 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2648 2649 // Handle the degenerate case which should have folded out. 2650 if (ShufMask[0] == -1 && ShufMask[1] == -1) { 2651 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); 2652 2653 MI.eraseFromParent(); 2654 return RBI.constrainGenericRegister(DstReg, RC, *MRI); 2655 } 2656 2657 // A legal VOP3P mask only reads one of the sources. 2658 int Mask[2]; 2659 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); 2660 2661 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || 2662 !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) 2663 return false; 2664 2665 // TODO: This also should have been folded out 2666 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { 2667 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) 2668 .addReg(SrcVec); 2669 2670 MI.eraseFromParent(); 2671 return true; 2672 } 2673 2674 if (Mask[0] == 1 && Mask[1] == -1) { 2675 if (IsVALU) { 2676 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 2677 .addImm(16) 2678 .addReg(SrcVec); 2679 } else { 2680 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 2681 .addReg(SrcVec) 2682 .addImm(16); 2683 } 2684 } else if (Mask[0] == -1 && Mask[1] == 0) { 2685 if (IsVALU) { 2686 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) 2687 .addImm(16) 2688 .addReg(SrcVec); 2689 } else { 2690 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) 2691 .addReg(SrcVec) 2692 .addImm(16); 2693 } 2694 } else if (Mask[0] == 0 && Mask[1] == 0) { 2695 if (IsVALU) { 2696 // Write low half of the register into the high half. 2697 MachineInstr *MovSDWA = 2698 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2699 .addImm(0) // $src0_modifiers 2700 .addReg(SrcVec) // $src0 2701 .addImm(0) // $clamp 2702 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2703 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2704 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2705 .addReg(SrcVec, RegState::Implicit); 2706 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2707 } else { 2708 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2709 .addReg(SrcVec) 2710 .addReg(SrcVec); 2711 } 2712 } else if (Mask[0] == 1 && Mask[1] == 1) { 2713 if (IsVALU) { 2714 // Write high half of the register into the low half. 
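// Mirror of the <0,0> case above: dst_sel = WORD_0 writes only the low 16
// bits, UNUSED_PRESERVE keeps the high half through the tied implicit use of
// SrcVec, and src0_sel = WORD_1 reads the high 16 bits, yielding mask <1,1>.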
2715 MachineInstr *MovSDWA = 2716 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2717 .addImm(0) // $src0_modifiers 2718 .addReg(SrcVec) // $src0 2719 .addImm(0) // $clamp 2720 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel 2721 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2722 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel 2723 .addReg(SrcVec, RegState::Implicit); 2724 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2725 } else { 2726 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) 2727 .addReg(SrcVec) 2728 .addReg(SrcVec); 2729 } 2730 } else if (Mask[0] == 1 && Mask[1] == 0) { 2731 if (IsVALU) { 2732 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg) 2733 .addReg(SrcVec) 2734 .addReg(SrcVec) 2735 .addImm(16); 2736 } else { 2737 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2738 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) 2739 .addReg(SrcVec) 2740 .addImm(16); 2741 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2742 .addReg(TmpReg) 2743 .addReg(SrcVec); 2744 } 2745 } else 2746 llvm_unreachable("all shuffle masks should be handled"); 2747 2748 MI.eraseFromParent(); 2749 return true; 2750 } 2751 2752 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 2753 if (I.isPHI()) 2754 return selectPHI(I); 2755 2756 if (!I.isPreISelOpcode()) { 2757 if (I.isCopy()) 2758 return selectCOPY(I); 2759 return true; 2760 } 2761 2762 switch (I.getOpcode()) { 2763 case TargetOpcode::G_AND: 2764 case TargetOpcode::G_OR: 2765 case TargetOpcode::G_XOR: 2766 if (selectImpl(I, *CoverageInfo)) 2767 return true; 2768 return selectG_AND_OR_XOR(I); 2769 case TargetOpcode::G_ADD: 2770 case TargetOpcode::G_SUB: 2771 if (selectImpl(I, *CoverageInfo)) 2772 return true; 2773 return selectG_ADD_SUB(I); 2774 case TargetOpcode::G_UADDO: 2775 case TargetOpcode::G_USUBO: 2776 case TargetOpcode::G_UADDE: 2777 case TargetOpcode::G_USUBE: 2778 return selectG_UADDO_USUBO_UADDE_USUBE(I); 2779 case TargetOpcode::G_INTTOPTR: 2780 case TargetOpcode::G_BITCAST: 2781 case TargetOpcode::G_PTRTOINT: 2782 return selectCOPY(I); 2783 case TargetOpcode::G_CONSTANT: 2784 case TargetOpcode::G_FCONSTANT: 2785 return selectG_CONSTANT(I); 2786 case TargetOpcode::G_FNEG: 2787 if (selectImpl(I, *CoverageInfo)) 2788 return true; 2789 return selectG_FNEG(I); 2790 case TargetOpcode::G_FABS: 2791 if (selectImpl(I, *CoverageInfo)) 2792 return true; 2793 return selectG_FABS(I); 2794 case TargetOpcode::G_EXTRACT: 2795 return selectG_EXTRACT(I); 2796 case TargetOpcode::G_MERGE_VALUES: 2797 case TargetOpcode::G_BUILD_VECTOR: 2798 case TargetOpcode::G_CONCAT_VECTORS: 2799 return selectG_MERGE_VALUES(I); 2800 case TargetOpcode::G_UNMERGE_VALUES: 2801 return selectG_UNMERGE_VALUES(I); 2802 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2803 return selectG_BUILD_VECTOR_TRUNC(I); 2804 case TargetOpcode::G_PTR_ADD: 2805 return selectG_PTR_ADD(I); 2806 case TargetOpcode::G_IMPLICIT_DEF: 2807 return selectG_IMPLICIT_DEF(I); 2808 case TargetOpcode::G_FREEZE: 2809 return selectCOPY(I); 2810 case TargetOpcode::G_INSERT: 2811 return selectG_INSERT(I); 2812 case TargetOpcode::G_INTRINSIC: 2813 return selectG_INTRINSIC(I); 2814 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 2815 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 2816 case TargetOpcode::G_ICMP: 2817 if (selectG_ICMP(I)) 2818 return true; 2819 return selectImpl(I, *CoverageInfo); 2820 case TargetOpcode::G_LOAD: 2821 case TargetOpcode::G_ATOMIC_CMPXCHG: 2822 case TargetOpcode::G_ATOMICRMW_XCHG: 2823 case 
TargetOpcode::G_ATOMICRMW_ADD: 2824 case TargetOpcode::G_ATOMICRMW_SUB: 2825 case TargetOpcode::G_ATOMICRMW_AND: 2826 case TargetOpcode::G_ATOMICRMW_OR: 2827 case TargetOpcode::G_ATOMICRMW_XOR: 2828 case TargetOpcode::G_ATOMICRMW_MIN: 2829 case TargetOpcode::G_ATOMICRMW_MAX: 2830 case TargetOpcode::G_ATOMICRMW_UMIN: 2831 case TargetOpcode::G_ATOMICRMW_UMAX: 2832 case TargetOpcode::G_ATOMICRMW_FADD: 2833 return selectG_LOAD_ATOMICRMW(I); 2834 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 2835 return selectG_AMDGPU_ATOMIC_CMPXCHG(I); 2836 case TargetOpcode::G_SELECT: 2837 return selectG_SELECT(I); 2838 case TargetOpcode::G_STORE: 2839 return selectG_STORE(I); 2840 case TargetOpcode::G_TRUNC: 2841 return selectG_TRUNC(I); 2842 case TargetOpcode::G_SEXT: 2843 case TargetOpcode::G_ZEXT: 2844 case TargetOpcode::G_ANYEXT: 2845 case TargetOpcode::G_SEXT_INREG: 2846 if (selectImpl(I, *CoverageInfo)) 2847 return true; 2848 return selectG_SZA_EXT(I); 2849 case TargetOpcode::G_BRCOND: 2850 return selectG_BRCOND(I); 2851 case TargetOpcode::G_FRAME_INDEX: 2852 case TargetOpcode::G_GLOBAL_VALUE: 2853 return selectG_FRAME_INDEX_GLOBAL_VALUE(I); 2854 case TargetOpcode::G_PTRMASK: 2855 return selectG_PTRMASK(I); 2856 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2857 return selectG_EXTRACT_VECTOR_ELT(I); 2858 case TargetOpcode::G_INSERT_VECTOR_ELT: 2859 return selectG_INSERT_VECTOR_ELT(I); 2860 case TargetOpcode::G_SHUFFLE_VECTOR: 2861 return selectG_SHUFFLE_VECTOR(I); 2862 case AMDGPU::G_AMDGPU_ATOMIC_INC: 2863 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 2864 initM0(I); 2865 return selectImpl(I, *CoverageInfo); 2866 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 2867 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 2868 const AMDGPU::ImageDimIntrinsicInfo *Intr 2869 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); 2870 assert(Intr && "not an image intrinsic with image pseudo"); 2871 return selectImageIntrinsic(I, Intr); 2872 } 2873 default: 2874 return selectImpl(I, *CoverageInfo); 2875 } 2876 return false; 2877 } 2878 2879 InstructionSelector::ComplexRendererFns 2880 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 2881 return {{ 2882 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2883 }}; 2884 2885 } 2886 2887 std::pair<Register, unsigned> 2888 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const { 2889 Register Src = Root.getReg(); 2890 Register OrigSrc = Src; 2891 unsigned Mods = 0; 2892 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 2893 2894 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 2895 Src = MI->getOperand(1).getReg(); 2896 Mods |= SISrcMods::NEG; 2897 MI = getDefIgnoringCopies(Src, *MRI); 2898 } 2899 2900 if (MI && MI->getOpcode() == AMDGPU::G_FABS) { 2901 Src = MI->getOperand(1).getReg(); 2902 Mods |= SISrcMods::ABS; 2903 } 2904 2905 if (Mods != 0 && 2906 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 2907 MachineInstr *UseMI = Root.getParent(); 2908 2909 // If we looked through copies to find source modifiers on an SGPR operand, 2910 // we now have an SGPR register source. To avoid potentially violating the 2911 // constant bus restriction, we need to insert a copy to a VGPR. 
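// Illustrative case (added): this happens e.g. when a G_FNEG or G_FABS of an
// SGPR value is folded into the source modifiers above. The temporary is
// created with cloneVirtualRegister so it inherits the type and bank/class
// of the original operand, and the COPY from the SGPR source is what the
// VALU instruction ends up reading.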
2912 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); 2913 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 2914 TII.get(AMDGPU::COPY), VGPRSrc) 2915 .addReg(Src); 2916 Src = VGPRSrc; 2917 } 2918 2919 return std::make_pair(Src, Mods); 2920 } 2921 2922 /// 2923 /// This will select either an SGPR or VGPR operand and will save us from 2924 /// having to write an extra tablegen pattern. 2925 InstructionSelector::ComplexRendererFns 2926 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 2927 return {{ 2928 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2929 }}; 2930 } 2931 2932 InstructionSelector::ComplexRendererFns 2933 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 2934 Register Src; 2935 unsigned Mods; 2936 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2937 2938 return {{ 2939 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2940 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 2941 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2942 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2943 }}; 2944 } 2945 2946 InstructionSelector::ComplexRendererFns 2947 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 2948 return {{ 2949 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2950 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2951 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2952 }}; 2953 } 2954 2955 InstructionSelector::ComplexRendererFns 2956 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 2957 Register Src; 2958 unsigned Mods; 2959 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2960 2961 return {{ 2962 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2963 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2964 }}; 2965 } 2966 2967 InstructionSelector::ComplexRendererFns 2968 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 2969 Register Reg = Root.getReg(); 2970 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 2971 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 2972 Def->getOpcode() == AMDGPU::G_FABS)) 2973 return {}; 2974 return {{ 2975 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2976 }}; 2977 } 2978 2979 std::pair<Register, unsigned> 2980 AMDGPUInstructionSelector::selectVOP3PModsImpl( 2981 Register Src, const MachineRegisterInfo &MRI) const { 2982 unsigned Mods = 0; 2983 MachineInstr *MI = MRI.getVRegDef(Src); 2984 2985 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 2986 // It's possible to see an f32 fneg here, but unlikely. 2987 // TODO: Treat f32 fneg as only high bit. 2988 MRI.getType(Src) == LLT::vector(2, 16)) { 2989 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 2990 Src = MI->getOperand(1).getReg(); 2991 MI = MRI.getVRegDef(Src); 2992 } 2993 2994 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 2995 2996 // Packed instructions do not have abs modifiers. 
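// OP_SEL_1 here is the op_sel_hi bit: the high half of the packed source
// feeds the high half of the operation, which is the normal encoding for
// packed math. No other op_sel patterns are folded yet (see the TODO above).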
2997 Mods |= SISrcMods::OP_SEL_1; 2998 2999 return std::make_pair(Src, Mods); 3000 } 3001 3002 InstructionSelector::ComplexRendererFns 3003 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 3004 MachineRegisterInfo &MRI 3005 = Root.getParent()->getParent()->getParent()->getRegInfo(); 3006 3007 Register Src; 3008 unsigned Mods; 3009 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 3010 3011 return {{ 3012 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3013 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3014 }}; 3015 } 3016 3017 InstructionSelector::ComplexRendererFns 3018 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 3019 Register Src; 3020 unsigned Mods; 3021 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3022 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) 3023 return None; 3024 3025 return {{ 3026 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3027 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3028 }}; 3029 } 3030 3031 InstructionSelector::ComplexRendererFns 3032 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 3033 // FIXME: Handle op_sel 3034 return {{ 3035 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3036 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 3037 }}; 3038 } 3039 3040 InstructionSelector::ComplexRendererFns 3041 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 3042 SmallVector<GEPInfo, 4> AddrInfo; 3043 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3044 3045 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3046 return None; 3047 3048 const GEPInfo &GEPInfo = AddrInfo[0]; 3049 Optional<int64_t> EncodedImm = 3050 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false); 3051 if (!EncodedImm) 3052 return None; 3053 3054 unsigned PtrReg = GEPInfo.SgprParts[0]; 3055 return {{ 3056 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3057 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3058 }}; 3059 } 3060 3061 InstructionSelector::ComplexRendererFns 3062 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 3063 SmallVector<GEPInfo, 4> AddrInfo; 3064 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3065 3066 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3067 return None; 3068 3069 const GEPInfo &GEPInfo = AddrInfo[0]; 3070 Register PtrReg = GEPInfo.SgprParts[0]; 3071 Optional<int64_t> EncodedImm = 3072 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 3073 if (!EncodedImm) 3074 return None; 3075 3076 return {{ 3077 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3078 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3079 }}; 3080 } 3081 3082 InstructionSelector::ComplexRendererFns 3083 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 3084 MachineInstr *MI = Root.getParent(); 3085 MachineBasicBlock *MBB = MI->getParent(); 3086 3087 SmallVector<GEPInfo, 4> AddrInfo; 3088 getAddrModeInfo(*MI, *MRI, AddrInfo); 3089 3090 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, 3091 // then we can select all ptr + 32-bit offsets not just immediate offsets. 3092 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3093 return None; 3094 3095 const GEPInfo &GEPInfo = AddrInfo[0]; 3096 // SGPR offset is unsigned. 
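// Bail out when there is no offset to fold, when it is negative, or when it
// does not fit in 32 bits, since it is materialized below with a single
// S_MOV_B32 into the soffset operand.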
3097 if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) 3098 return None; 3099 3100 // If we make it this far we have a load with an 32-bit immediate offset. 3101 // It is OK to select this using a sgpr offset, because we have already 3102 // failed trying to select this load into one of the _IMM variants since 3103 // the _IMM Patterns are considered before the _SGPR patterns. 3104 Register PtrReg = GEPInfo.SgprParts[0]; 3105 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3106 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) 3107 .addImm(GEPInfo.Imm); 3108 return {{ 3109 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3110 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } 3111 }}; 3112 } 3113 3114 template <bool Signed> 3115 InstructionSelector::ComplexRendererFns 3116 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { 3117 MachineInstr *MI = Root.getParent(); 3118 3119 InstructionSelector::ComplexRendererFns Default = {{ 3120 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3121 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset 3122 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 3123 }}; 3124 3125 if (!STI.hasFlatInstOffsets()) 3126 return Default; 3127 3128 const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); 3129 if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD) 3130 return Default; 3131 3132 Optional<int64_t> Offset = 3133 getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); 3134 if (!Offset.hasValue()) 3135 return Default; 3136 3137 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 3138 if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed)) 3139 return Default; 3140 3141 Register BasePtr = OpDef->getOperand(1).getReg(); 3142 3143 return {{ 3144 [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, 3145 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, 3146 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 3147 }}; 3148 } 3149 3150 InstructionSelector::ComplexRendererFns 3151 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 3152 return selectFlatOffsetImpl<false>(Root); 3153 } 3154 3155 InstructionSelector::ComplexRendererFns 3156 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { 3157 return selectFlatOffsetImpl<true>(Root); 3158 } 3159 3160 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 3161 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 3162 return PSV && PSV->isStack(); 3163 } 3164 3165 InstructionSelector::ComplexRendererFns 3166 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 3167 MachineInstr *MI = Root.getParent(); 3168 MachineBasicBlock *MBB = MI->getParent(); 3169 MachineFunction *MF = MBB->getParent(); 3170 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 3171 3172 int64_t Offset = 0; 3173 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) && 3174 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) { 3175 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3176 3177 // TODO: Should this be inside the render function? The iterator seems to 3178 // move. 
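// Split the constant address: the 4096-byte aligned part is materialized in
// a VGPR and used as vaddr, while the low 12 bits (Offset & 4095) go into
// the MUBUF immediate offset field below.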
3179 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 3180 HighBits) 3181 .addImm(Offset & ~4095); 3182 3183 return {{[=](MachineInstrBuilder &MIB) { // rsrc 3184 MIB.addReg(Info->getScratchRSrcReg()); 3185 }, 3186 [=](MachineInstrBuilder &MIB) { // vaddr 3187 MIB.addReg(HighBits); 3188 }, 3189 [=](MachineInstrBuilder &MIB) { // soffset 3190 const MachineMemOperand *MMO = *MI->memoperands_begin(); 3191 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 3192 3193 if (isStackPtrRelative(PtrInfo)) 3194 MIB.addReg(Info->getStackPtrOffsetReg()); 3195 else 3196 MIB.addImm(0); 3197 }, 3198 [=](MachineInstrBuilder &MIB) { // offset 3199 MIB.addImm(Offset & 4095); 3200 }}}; 3201 } 3202 3203 assert(Offset == 0 || Offset == -1); 3204 3205 // Try to fold a frame index directly into the MUBUF vaddr field, and any 3206 // offsets. 3207 Optional<int> FI; 3208 Register VAddr = Root.getReg(); 3209 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) { 3210 if (isBaseWithConstantOffset(Root, *MRI)) { 3211 const MachineOperand &LHS = RootDef->getOperand(1); 3212 const MachineOperand &RHS = RootDef->getOperand(2); 3213 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); 3214 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); 3215 if (LHSDef && RHSDef) { 3216 int64_t PossibleOffset = 3217 RHSDef->getOperand(1).getCImm()->getSExtValue(); 3218 if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && 3219 (!STI.privateMemoryResourceIsRangeChecked() || 3220 KnownBits->signBitIsZero(LHS.getReg()))) { 3221 if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) 3222 FI = LHSDef->getOperand(1).getIndex(); 3223 else 3224 VAddr = LHS.getReg(); 3225 Offset = PossibleOffset; 3226 } 3227 } 3228 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { 3229 FI = RootDef->getOperand(1).getIndex(); 3230 } 3231 } 3232 3233 return {{[=](MachineInstrBuilder &MIB) { // rsrc 3234 MIB.addReg(Info->getScratchRSrcReg()); 3235 }, 3236 [=](MachineInstrBuilder &MIB) { // vaddr 3237 if (FI.hasValue()) 3238 MIB.addFrameIndex(FI.getValue()); 3239 else 3240 MIB.addReg(VAddr); 3241 }, 3242 [=](MachineInstrBuilder &MIB) { // soffset 3243 // If we don't know this private access is a local stack object, it 3244 // needs to be relative to the entry point's scratch wave offset. 3245 // TODO: Should split large offsets that don't fit like above. 3246 // TODO: Don't use scratch wave offset just because the offset 3247 // didn't fit. 3248 if (!Info->isEntryFunction() && FI.hasValue()) 3249 MIB.addReg(Info->getStackPtrOffsetReg()); 3250 else 3251 MIB.addImm(0); 3252 }, 3253 [=](MachineInstrBuilder &MIB) { // offset 3254 MIB.addImm(Offset); 3255 }}}; 3256 } 3257 3258 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, 3259 int64_t Offset, 3260 unsigned OffsetBits) const { 3261 if ((OffsetBits == 16 && !isUInt<16>(Offset)) || 3262 (OffsetBits == 8 && !isUInt<8>(Offset))) 3263 return false; 3264 3265 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 3266 return true; 3267 3268 // On Southern Islands instruction with a negative base value and an offset 3269 // don't seem to work. 
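// So only fold the offset when the base is provably non-negative, i.e. its
// sign bit is known to be zero.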
  return KnownBits->signBitIsZero(Base);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(Info->getScratchRSrcReg());
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (isStackPtrRelative(PtrInfo))
          MIB.addReg(Info->getStackPtrOffsetReg());
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
    }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    if (isDSOffsetLegal(PtrBase, Offset, 16)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, Offset);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset + 1); }
    }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    int64_t DWordOffset0 = Offset / 4;
    int64_t DWordOffset1 = DWordOffset0 + 1;
    if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, DWordOffset0);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value and the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this
/// does not match the pattern.
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
  Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = MRI.getVRegDef(Root);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0};

  MachineOperand &RHS = RootI->getOperand(2);
  Optional<ValueAndVReg> MaybeOffset
    = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
  if (!MaybeOffset)
    return {Root, 0};
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
}

static void addZeroImm(MachineInstrBuilder &MIB) {
  MIB.addImm(0);
}

/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If
/// \p BasePtr is not valid, a null base pointer will be used.
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the subregister half that holds the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}

static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}

static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
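  // Note (an interpretation, not verified against hardware documentation):
  // FormatLo ends up in dword2 of the descriptor built by buildRSRC, so
  // passing -1 here sets that word to 0xffffffff. If dword2 is the usual
  // num_records field, this effectively removes the range check for the
  // offset-only addressing mode, whereas the addr64 variant above leaves it
  // at 0.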
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted.
    // FIXME: We don't actually know that these values are defined by
    // operand 0 of their defs.
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return true if the addr64 MUBUF mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(SOffset)
    .addImm(ImmOffset);
  ImmOffset = 0;
}

bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
  Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // The addr64 bit was removed for Volcanic Islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
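      // It can therefore feed the SRD base pointer directly, and the
      // (possibly divergent) N3 becomes the 64-bit vaddr.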
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource.
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
  int64_t &Offset) const {
  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Register SRDPtr = AddrData.N0;
  Offset = AddrData.Offset;

  // TODO: Look through extensions for 32-bit soffset.
  MachineIRBuilder B(*Root.getParent());

  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
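  // Unlike selectMUBUFAddr64 above, only an slc operand is rendered here;
  // the atomic MUBUF pseudos presumably carry glc separately, since it
  // distinguishes the return and no-return atomic variants.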
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm // slc
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm // slc
    }};
}

/// Get an immediate that must be 32 bits, and treated as zero extended.
static Optional<uint64_t> getConstantZext32Val(Register Reg,
                                               const MachineRegisterInfo &MRI) {
  // getConstantVRegVal sign-extends any values, so see if that matters.
  Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return None;
  return Lo_32(*OffsetVal);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm =
    AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm
    = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx == -1);

  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT &&
           "Expected G_CONSTANT");
    MIB.addImm(Op.getCImm()->getSExtValue());
  }
}

void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
}

/// This only really exists to satisfy the DAG type-checking machinery, so it
/// is a no-op here.
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm());
}

void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
}

void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
}

void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
}

void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}

bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}