//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
                                        CodeGenCoverage &CoverageInfo) {
  MRI = &MF.getRegInfo();
  InstructionSelector::setupMF(MF, KB, CoverageInfo);
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  if (Register::isPhysicalRegister(Reg))
    return Reg == TRI.getVCC();

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to specifically
    // know that the result is VCCRegBank, and not just an SGPR with size 1.
    // An SReg_32 with size 1 is ambiguous with wave32.
    if (Src.isUndef()) {
      const TargetRegisterClass *SrcRC =
        TRI.getConstrainedRegClassForOperand(Src, *MRI);
      if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (Register::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  MachineOperand &Dst = I.getOperand(0);
  MachineOperand &Src0 = I.getOperand(1);
  MachineOperand &Src1 = I.getOperand(2);
  Register DstReg = Dst.getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
    const TargetRegisterClass *RC = TRI.getBoolRC();
    unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
                                           RC == &AMDGPU::SReg_64RegClass);
    I.setDesc(TII.get(InstOpc));
    // Dead implicit-def of scc
    I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                           true, // isImp
                                           false, // isKill
                                           true)); // isDead

    // FIXME: Hack to avoid turning the register bank into a register class.
    // The selector for G_ICMP relies on seeing the register bank for the result
    // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will
    // be ambiguous whether it's a scalar or vector bool.
    if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg()))
      MRI->setRegClass(Src0.getReg(), RC);
    if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg()))
      MRI->setRegClass(Src1.getReg(), RC);

    return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
  }

  // TODO: Should this allow an SCC bank result, and produce a copy from SCC for
  // the result?
  if (DstRB->getID() == AMDGPU::SGPRRegBankID) {
    unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32);
    I.setDesc(TII.get(InstOpc));
    // Dead implicit-def of scc
    I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                           true, // isImp
                                           false, // isKill
                                           true)); // isDead
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
      IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
    .addReg(AMDGPU::SCC);

  if (!MRI->getRegClassOrNull(Dst1Reg))
    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  const unsigned SrcFlags = getUndefRegState(Src.isUndef());

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, SrcFlags, SubRegs[I]);

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
  MachineInstr &MI) const {
  if (selectImpl(MI, *CoverageInfo))
    return true;

  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != V2S16)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI->getType(Src0) != S32)
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;

      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
        .addImm(Lo16 | (Hi16 << 16));
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
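    // The undefined high half doesn't need to be materialized, so rewrite the
    // instruction in place as a plain 32-bit COPY of the low source.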
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.RemoveOperand(2);
    return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
           RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
  }

  Register ShiftSrc0;
  Register ShiftSrc1;
  int64_t ShiftAmt;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  // FIXME: This is an inconvenient way to check a specific value
  bool Shift0 = mi_match(
    Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  bool Shift1 = mi_match(
    Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
    // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
    auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
      .addReg(ShiftSrc0)
      .addImm(16);

    MI.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
  const TargetRegisterClass *Src1RC =
    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
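// The intrinsic produces two results, the scaled value and the VCC-style
// condition output, so the V_DIV_SCALE_* instruction is built manually with an
// explicit def for each.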
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64;
  else
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addUse(Src0)
    .addUse(Denom)
    .addUse(Numer);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::WWM);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
    return selectIntrinsicIcmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
            I.getOperand(0).getReg())
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
                           .add(I.getOperand(2))
                           .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
                               *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;

  if (Size != STI.getWavefrontSize())
    return false;

  Optional<ValueAndVReg> Arg =
      getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);

  if (Arg.hasValue()) {
    const int64_t Value = Arg.getValue().Value;
    if (Value == 0) {
      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else if (Value == -1) { // all ones
      Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
    } else
      return false;
  } else {
    Register SrcReg = I.getOperand(2).getReg();
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
  }

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
  MachineBasicBlock *BB = MI.getParent();
  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
      .add(MI.getOperand(1));

  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
  return true;
}

static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
  switch (MF.getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    return 1;
  case CallingConv::AMDGPU_VS:
    return 2;
  case CallingConv::AMDGPU_GS:
    return 3;
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_ES:
    report_fatal_error("ds_ordered_count unsupported for this calling conv");
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::C:
  case CallingConv::Fast:
  default:
    // Assume other calling conventions are various compute callable functions
    return 0;
  }
}

bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
  MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease)
    report_fatal_error("ds_ordered_count: wave_done requires wave_release");

  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      report_fatal_error(
        "ds_ordered_count: dword count must be between 1 and 4");
    }
  }

  if (IndexOperand)
    report_fatal_error("ds_ordered_count: bad index operand");

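  // Pack the DS_ORDERED_COUNT offset field from the validated operands. As
  // assembled below, bits [7:0] hold the ordered count index scaled by 4,
  // bit 8 is wave_release, bit 9 is wave_done, bits [11:10] are the shader
  // type, bit 12 selects add (0) or swap (1), and on GFX10 bits [15:14]
  // encode the dword count minus 1.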
  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType = getDSShaderTypeValue(*MF);

  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                     (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !STI.hasGWSSemaReleaseAll())
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  assert(OffsetDef);

  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset, OffsetDef)
      = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16);

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);
    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .addImm(-1) // $gds
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
    .addImm(Offset)
    .addImm(IsGDS ? -1 : 0)
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x2;

  return TexFailCtrl == 0;
}

static bool parseCachePolicy(uint64_t Value,
                             bool *GLC, bool *SLC, bool *DLC) {
  if (GLC) {
    *GLC = (Value & 0x1) ? 1 : 0;
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = (Value & 0x2) ? 1 : 0;
    Value &= ~(uint64_t)0x2;
  }
  if (DLC) {
    *DLC = (Value & 0x4) ? 1 : 0;
    Value &= ~(uint64_t)0x4;
  }

  return Value == 0;
}

bool AMDGPUInstructionSelector::selectImageIntrinsic(
  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
    AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
    AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;

  const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
                                             MI.getNumExplicitDefs());
  int NumVAddr, NumGradients;
  std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = false;

  // XXX - Can we just get the second to last argument for ctrl?
  unsigned CtrlIdx; // Index of texfailctrl argument
  bool Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = true;
    CtrlIdx = VAddrIdx + NumVAddr + 1;
  } else {
    Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
    CtrlIdx = VAddrIdx + NumVAddr + 3;
  }

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients
  if (IsA16 && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    const int DMaskIdx = 2; // Input/output + intrinsic ID.

    DMask = MI.getOperand(DMaskIdx).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      // One memoperand is mandatory, except for getresinfo.
      // FIXME: Check this in verifier.
      if (!MI.memoperands_empty()) {
        const MachineMemOperand *MMO = *MI.memoperands_begin();

        // Infer d16 from the memory size, as the register type will be mangled by
        // unpacked subtargets, or by TFE.
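        // e.g. a 4-component dmask backed by an 8-byte access gives
        // 8 * 8 / 4 = 16 bits per component, i.e. a d16 load.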
        IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;

        if (IsD16 && !STI.hasUnpackedD16VMem())
          NumVDataDwords = (DMaskLanes + 1) / 2;
      }
    }
  }

  // Optimize _L to _LZ when _L is zero
  if (LZMappingInfo) {
    // The legalizer replaced the register with an immediate 0 if we need to
    // change the opcode.
    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (MIPMappingInfo) {
    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
    }
  }

  // Set G16 opcode
  if (IsG16 && !IsA16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  bool GLC = false;
  bool SLC = false;
  bool DLC = false;
  if (BaseOpcode->Atomic) {
    GLC = true; // TODO no-return optimization
    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return false;
  } else {
    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return false;
  }

  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (int I = 0; I < NumVAddr; ++I) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register
  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  if (IsTexFail)
    ++NumVDataDwords;

  int Opcode = -1;
  if (IsGFX10) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;

      MIB.addDef(TmpReg);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
        .addReg(TmpReg, RegState::Kill, SubReg);

    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  for (int i = 0; i != NumVAddrRegs; ++i) {
    MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler

  MIB.addImm(DMask); // dmask

  if (IsGFX10)
    MIB.addImm(DimInfo->Encoding);
  MIB.addImm(Unorm);
  if (IsGFX10)
    MIB.addImm(DLC);

  MIB.addImm(GLC);
  MIB.addImm(SLC);
  MIB.addImm(IsA16 &&  // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10)
    MIB.addImm(IsA16 ? -1 : 0);

  MIB.addImm(TFE); // tfe
  MIB.addImm(LWE); // lwe
  if (!IsGFX10)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
  MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  default: {
    return selectImpl(I, *CoverageInfo);
  }
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it. So we need to manually set the register class here.
1658 if (!MRI->getRegClassOrNull(CCReg)) 1659 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1660 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1661 .add(I.getOperand(2)) 1662 .add(I.getOperand(3)); 1663 1664 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1665 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1666 I.eraseFromParent(); 1667 return Ret; 1668 } 1669 1670 // Wide VGPR select should have been split in RegBankSelect. 1671 if (Size > 32) 1672 return false; 1673 1674 MachineInstr *Select = 1675 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1676 .addImm(0) 1677 .add(I.getOperand(3)) 1678 .addImm(0) 1679 .add(I.getOperand(2)) 1680 .add(I.getOperand(1)); 1681 1682 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1683 I.eraseFromParent(); 1684 return Ret; 1685 } 1686 1687 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1688 initM0(I); 1689 return selectImpl(I, *CoverageInfo); 1690 } 1691 1692 static int sizeToSubRegIndex(unsigned Size) { 1693 switch (Size) { 1694 case 32: 1695 return AMDGPU::sub0; 1696 case 64: 1697 return AMDGPU::sub0_sub1; 1698 case 96: 1699 return AMDGPU::sub0_sub1_sub2; 1700 case 128: 1701 return AMDGPU::sub0_sub1_sub2_sub3; 1702 case 256: 1703 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1704 default: 1705 if (Size < 32) 1706 return AMDGPU::sub0; 1707 if (Size > 256) 1708 return -1; 1709 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1710 } 1711 } 1712 1713 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1714 Register DstReg = I.getOperand(0).getReg(); 1715 Register SrcReg = I.getOperand(1).getReg(); 1716 const LLT DstTy = MRI->getType(DstReg); 1717 const LLT SrcTy = MRI->getType(SrcReg); 1718 const LLT S1 = LLT::scalar(1); 1719 1720 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1721 const RegisterBank *DstRB; 1722 if (DstTy == S1) { 1723 // This is a special case. We don't treat s1 for legalization artifacts as 1724 // vcc booleans. 1725 DstRB = SrcRB; 1726 } else { 1727 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1728 if (SrcRB != DstRB) 1729 return false; 1730 } 1731 1732 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 1733 1734 unsigned DstSize = DstTy.getSizeInBits(); 1735 unsigned SrcSize = SrcTy.getSizeInBits(); 1736 1737 const TargetRegisterClass *SrcRC 1738 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1739 const TargetRegisterClass *DstRC 1740 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1741 if (!SrcRC || !DstRC) 1742 return false; 1743 1744 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1745 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1746 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1747 return false; 1748 } 1749 1750 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { 1751 MachineBasicBlock *MBB = I.getParent(); 1752 const DebugLoc &DL = I.getDebugLoc(); 1753 1754 Register LoReg = MRI->createVirtualRegister(DstRC); 1755 Register HiReg = MRI->createVirtualRegister(DstRC); 1756 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 1757 .addReg(SrcReg, 0, AMDGPU::sub0); 1758 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 1759 .addReg(SrcReg, 0, AMDGPU::sub1); 1760 1761 if (IsVALU && STI.hasSDWA()) { 1762 // Write the low 16-bits of the high element into the high 16-bits of the 1763 // low element. 
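      // The single SDWA mov below reads WORD_0 of HiReg and writes it into
      // WORD_1 of DstReg; UNUSED_PRESERVE together with the tied implicit use
      // of LoReg keeps the low 16 bits, so no shift/and/or sequence is needed.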
1764 MachineInstr *MovSDWA = 1765 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1766 .addImm(0) // $src0_modifiers 1767 .addReg(HiReg) // $src0 1768 .addImm(0) // $clamp 1769 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1770 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1771 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1772 .addReg(LoReg, RegState::Implicit); 1773 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1774 } else { 1775 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1776 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1777 Register ImmReg = MRI->createVirtualRegister(DstRC); 1778 if (IsVALU) { 1779 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1780 .addImm(16) 1781 .addReg(HiReg); 1782 } else { 1783 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1784 .addReg(HiReg) 1785 .addImm(16); 1786 } 1787 1788 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1789 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1790 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1791 1792 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1793 .addImm(0xffff); 1794 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1795 .addReg(LoReg) 1796 .addReg(ImmReg); 1797 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1798 .addReg(TmpReg0) 1799 .addReg(TmpReg1); 1800 } 1801 1802 I.eraseFromParent(); 1803 return true; 1804 } 1805 1806 if (!DstTy.isScalar()) 1807 return false; 1808 1809 if (SrcSize > 32) { 1810 int SubRegIdx = sizeToSubRegIndex(DstSize); 1811 if (SubRegIdx == -1) 1812 return false; 1813 1814 // Deal with weird cases where the class only partially supports the subreg 1815 // index. 1816 const TargetRegisterClass *SrcWithSubRC 1817 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1818 if (!SrcWithSubRC) 1819 return false; 1820 1821 if (SrcWithSubRC != SrcRC) { 1822 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1823 return false; 1824 } 1825 1826 I.getOperand(1).setSubReg(SubRegIdx); 1827 } 1828 1829 I.setDesc(TII.get(TargetOpcode::COPY)); 1830 return true; 1831 } 1832 1833 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1834 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1835 Mask = maskTrailingOnes<unsigned>(Size); 1836 int SignedMask = static_cast<int>(Mask); 1837 return SignedMask >= -16 && SignedMask <= 64; 1838 } 1839 1840 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1841 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1842 Register Reg, const MachineRegisterInfo &MRI, 1843 const TargetRegisterInfo &TRI) const { 1844 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1845 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1846 return RB; 1847 1848 // Ignore the type, since we don't use vcc in artifacts. 
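  // Note: an empty LLT is deliberately passed below, so a 1-bit value that
  // already has a register class is not reinterpreted as a VCC boolean.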
1849 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1850 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1851 return nullptr; 1852 } 1853 1854 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1855 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1856 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1857 const DebugLoc &DL = I.getDebugLoc(); 1858 MachineBasicBlock &MBB = *I.getParent(); 1859 const Register DstReg = I.getOperand(0).getReg(); 1860 const Register SrcReg = I.getOperand(1).getReg(); 1861 1862 const LLT DstTy = MRI->getType(DstReg); 1863 const LLT SrcTy = MRI->getType(SrcReg); 1864 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1865 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1866 const unsigned DstSize = DstTy.getSizeInBits(); 1867 if (!DstTy.isScalar()) 1868 return false; 1869 1870 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1871 return selectCOPY(I); 1872 1873 // Artifact casts should never use vcc. 1874 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1875 1876 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1877 // 64-bit should have been split up in RegBankSelect 1878 1879 // Try to use an and with a mask if it will save code size. 1880 unsigned Mask; 1881 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1882 MachineInstr *ExtI = 1883 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1884 .addImm(Mask) 1885 .addReg(SrcReg); 1886 I.eraseFromParent(); 1887 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1888 } 1889 1890 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1891 MachineInstr *ExtI = 1892 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1893 .addReg(SrcReg) 1894 .addImm(0) // Offset 1895 .addImm(SrcSize); // Width 1896 I.eraseFromParent(); 1897 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1898 } 1899 1900 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1901 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 1902 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 1903 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 1904 return false; 1905 1906 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1907 const unsigned SextOpc = SrcSize == 8 ? 1908 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1909 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1910 .addReg(SrcReg); 1911 I.eraseFromParent(); 1912 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1913 } 1914 1915 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1916 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1917 1918 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1919 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 1920 // We need a 64-bit register source, but the high bits don't matter. 1921 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1922 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1923 unsigned SubReg = InReg ? 
AMDGPU::sub0 : 0; 1924 1925 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1926 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1927 .addReg(SrcReg, 0, SubReg) 1928 .addImm(AMDGPU::sub0) 1929 .addReg(UndefReg) 1930 .addImm(AMDGPU::sub1); 1931 1932 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1933 .addReg(ExtReg) 1934 .addImm(SrcSize << 16); 1935 1936 I.eraseFromParent(); 1937 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1938 } 1939 1940 unsigned Mask; 1941 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1942 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1943 .addReg(SrcReg) 1944 .addImm(Mask); 1945 } else { 1946 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1947 .addReg(SrcReg) 1948 .addImm(SrcSize << 16); 1949 } 1950 1951 I.eraseFromParent(); 1952 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1953 } 1954 1955 return false; 1956 } 1957 1958 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1959 MachineBasicBlock *BB = I.getParent(); 1960 MachineOperand &ImmOp = I.getOperand(1); 1961 1962 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1963 if (ImmOp.isFPImm()) { 1964 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1965 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1966 } else if (ImmOp.isCImm()) { 1967 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); 1968 } 1969 1970 Register DstReg = I.getOperand(0).getReg(); 1971 unsigned Size; 1972 bool IsSgpr; 1973 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1974 if (RB) { 1975 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1976 Size = MRI->getType(DstReg).getSizeInBits(); 1977 } else { 1978 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1979 IsSgpr = TRI.isSGPRClass(RC); 1980 Size = TRI.getRegSizeInBits(*RC); 1981 } 1982 1983 if (Size != 32 && Size != 64) 1984 return false; 1985 1986 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1987 if (Size == 32) { 1988 I.setDesc(TII.get(Opcode)); 1989 I.addImplicitDefUseOperands(*MF); 1990 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1991 } 1992 1993 const DebugLoc &DL = I.getDebugLoc(); 1994 1995 APInt Imm(Size, I.getOperand(1).getImm()); 1996 1997 MachineInstr *ResInst; 1998 if (IsSgpr && TII.isInlineConstant(Imm)) { 1999 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 2000 .addImm(I.getOperand(1).getImm()); 2001 } else { 2002 const TargetRegisterClass *RC = IsSgpr ? 
2003 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 2004 Register LoReg = MRI->createVirtualRegister(RC); 2005 Register HiReg = MRI->createVirtualRegister(RC); 2006 2007 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 2008 .addImm(Imm.trunc(32).getZExtValue()); 2009 2010 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 2011 .addImm(Imm.ashr(32).getZExtValue()); 2012 2013 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2014 .addReg(LoReg) 2015 .addImm(AMDGPU::sub0) 2016 .addReg(HiReg) 2017 .addImm(AMDGPU::sub1); 2018 } 2019 2020 // We can't call constrainSelectedInstRegOperands here, because it doesn't 2021 // work for target independent opcodes 2022 I.eraseFromParent(); 2023 const TargetRegisterClass *DstRC = 2024 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 2025 if (!DstRC) 2026 return true; 2027 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 2028 } 2029 2030 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2031 // Only manually handle the f64 SGPR case. 2032 // 2033 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2034 // the bit ops theoretically have a second result due to the implicit def of 2035 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2036 // that is easy by disabling the check. The result works, but uses a 2037 // nonsensical sreg32orlds_and_sreg_1 regclass. 2038 // 2039 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2040 // the variadic REG_SEQUENCE operands. 2041 2042 Register Dst = MI.getOperand(0).getReg(); 2043 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2044 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2045 MRI->getType(Dst) != LLT::scalar(64)) 2046 return false; 2047 2048 Register Src = MI.getOperand(1).getReg(); 2049 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 2050 if (Fabs) 2051 Src = Fabs->getOperand(1).getReg(); 2052 2053 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2054 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2055 return false; 2056 2057 MachineBasicBlock *BB = MI.getParent(); 2058 const DebugLoc &DL = MI.getDebugLoc(); 2059 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2060 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2061 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2062 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2063 2064 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2065 .addReg(Src, 0, AMDGPU::sub0); 2066 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2067 .addReg(Src, 0, AMDGPU::sub1); 2068 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2069 .addImm(0x80000000); 2070 2071 // Set or toggle sign bit. 2072 unsigned Opc = Fabs ? 
AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32; 2073 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg) 2074 .addReg(HiReg) 2075 .addReg(ConstReg); 2076 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2077 .addReg(LoReg) 2078 .addImm(AMDGPU::sub0) 2079 .addReg(OpReg) 2080 .addImm(AMDGPU::sub1); 2081 MI.eraseFromParent(); 2082 return true; 2083 } 2084 2085 // FIXME: This is a workaround for the same tablegen problems as G_FNEG 2086 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { 2087 Register Dst = MI.getOperand(0).getReg(); 2088 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2089 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2090 MRI->getType(Dst) != LLT::scalar(64)) 2091 return false; 2092 2093 Register Src = MI.getOperand(1).getReg(); 2094 MachineBasicBlock *BB = MI.getParent(); 2095 const DebugLoc &DL = MI.getDebugLoc(); 2096 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2097 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2098 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2099 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2100 2101 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2102 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2103 return false; 2104 2105 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2106 .addReg(Src, 0, AMDGPU::sub0); 2107 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2108 .addReg(Src, 0, AMDGPU::sub1); 2109 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2110 .addImm(0x7fffffff); 2111 2112 // Clear sign bit. 2113 // TODO: Should this use S_BITSET0_*? 2114 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) 2115 .addReg(HiReg) 2116 .addReg(ConstReg); 2117 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2118 .addReg(LoReg) 2119 .addImm(AMDGPU::sub0) 2120 .addReg(OpReg) 2121 .addImm(AMDGPU::sub1); 2122 2123 MI.eraseFromParent(); 2124 return true; 2125 } 2126 2127 static bool isConstant(const MachineInstr &MI) { 2128 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 2129 } 2130 2131 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 2132 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 2133 2134 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); 2135 2136 assert(PtrMI); 2137 2138 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 2139 return; 2140 2141 GEPInfo GEPInfo(*PtrMI); 2142 2143 for (unsigned i = 1; i != 3; ++i) { 2144 const MachineOperand &GEPOp = PtrMI->getOperand(i); 2145 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 2146 assert(OpDef); 2147 if (i == 2 && isConstant(*OpDef)) { 2148 // TODO: Could handle constant base + variable offset, but a combine 2149 // probably should have commuted it.
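      // e.g. for %ptr = G_PTR_ADD %base, %cst with %cst = G_CONSTANT i64 16,
      // this records Imm = 16 and leaves %base to be classified as an SGPR or
      // VGPR part below.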
2150 assert(GEPInfo.Imm == 0); 2151 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 2152 continue; 2153 } 2154 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 2155 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 2156 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 2157 else 2158 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 2159 } 2160 2161 AddrInfo.push_back(GEPInfo); 2162 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 2163 } 2164 2165 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 2166 if (!MI.hasOneMemOperand()) 2167 return false; 2168 2169 const MachineMemOperand *MMO = *MI.memoperands_begin(); 2170 const Value *Ptr = MMO->getValue(); 2171 2172 // UndefValue means this is a load of a kernel input. These are uniform. 2173 // Sometimes LDS instructions have constant pointers. 2174 // If Ptr is null, then that means this mem operand contains a 2175 // PseudoSourceValue like GOT. 2176 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || 2177 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) 2178 return true; 2179 2180 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 2181 return true; 2182 2183 const Instruction *I = dyn_cast<Instruction>(Ptr); 2184 return I && I->getMetadata("amdgpu.uniform"); 2185 } 2186 2187 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { 2188 for (const GEPInfo &GEPInfo : AddrInfo) { 2189 if (!GEPInfo.VgprParts.empty()) 2190 return true; 2191 } 2192 return false; 2193 } 2194 2195 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { 2196 MachineBasicBlock *BB = I.getParent(); 2197 2198 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); 2199 unsigned AS = PtrTy.getAddressSpace(); 2200 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && 2201 STI.ldsRequiresM0Init()) { 2202 // If DS instructions require M0 initialization, insert it before selecting. 2203 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 2204 .addImm(-1); 2205 } 2206 } 2207 2208 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { 2209 initM0(I); 2210 return selectImpl(I, *CoverageInfo); 2211 } 2212 2213 // TODO: No rtn optimization. 2214 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG( 2215 MachineInstr &MI) const { 2216 Register PtrReg = MI.getOperand(1).getReg(); 2217 const LLT PtrTy = MRI->getType(PtrReg); 2218 if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || 2219 STI.useFlatForGlobal()) 2220 return selectImpl(MI, *CoverageInfo); 2221 2222 Register DstReg = MI.getOperand(0).getReg(); 2223 const LLT Ty = MRI->getType(DstReg); 2224 const bool Is64 = Ty.getSizeInBits() == 64; 2225 const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; 2226 Register TmpReg = MRI->createVirtualRegister( 2227 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); 2228 2229 const DebugLoc &DL = MI.getDebugLoc(); 2230 MachineBasicBlock *BB = MI.getParent(); 2231 2232 Register VAddr, RSrcReg, SOffset; 2233 int64_t Offset = 0; 2234 2235 unsigned Opcode; 2236 if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) { 2237 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN : 2238 AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN; 2239 } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr, 2240 RSrcReg, SOffset, Offset)) { 2241 Opcode = Is64 ?
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : 2242 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; 2243 } else 2244 return selectImpl(MI, *CoverageInfo); 2245 2246 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) 2247 .addReg(MI.getOperand(2).getReg()); 2248 2249 if (VAddr) 2250 MIB.addReg(VAddr); 2251 2252 MIB.addReg(RSrcReg); 2253 if (SOffset) 2254 MIB.addReg(SOffset); 2255 else 2256 MIB.addImm(0); 2257 2258 MIB.addImm(Offset); 2259 MIB.addImm(0); // slc 2260 MIB.cloneMemRefs(MI); 2261 2262 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) 2263 .addReg(TmpReg, RegState::Kill, SubReg); 2264 2265 MI.eraseFromParent(); 2266 2267 MRI->setRegClass( 2268 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); 2269 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2270 } 2271 2272 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 2273 MachineBasicBlock *BB = I.getParent(); 2274 MachineOperand &CondOp = I.getOperand(0); 2275 Register CondReg = CondOp.getReg(); 2276 const DebugLoc &DL = I.getDebugLoc(); 2277 2278 unsigned BrOpcode; 2279 Register CondPhysReg; 2280 const TargetRegisterClass *ConstrainRC; 2281 2282 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2283 // whether the branch is uniform when selecting the instruction. In 2284 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 2285 // RegBankSelect knows what it's doing if the branch condition is scc, even 2286 // though it currently does not. 2287 if (!isVCC(CondReg, *MRI)) { 2288 if (MRI->getType(CondReg) != LLT::scalar(32)) 2289 return false; 2290 2291 CondPhysReg = AMDGPU::SCC; 2292 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 2293 // FIXME: Hack for isSCC tests 2294 ConstrainRC = &AMDGPU::SGPR_32RegClass; 2295 } else { 2296 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 2297 // We sort of know, based on the register bank, that a VCC producer ands 2298 // inactive lanes with 0. What if there was a logical operation with vcc 2299 // producers in different blocks/with different exec masks? 2300 // FIXME: Should scc->vcc copies and with exec? 2301 CondPhysReg = TRI.getVCC(); 2302 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 2303 ConstrainRC = TRI.getBoolRC(); 2304 } 2305 2306 if (!MRI->getRegClassOrNull(CondReg)) 2307 MRI->setRegClass(CondReg, ConstrainRC); 2308 2309 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 2310 .addReg(CondReg); 2311 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 2312 .addMBB(I.getOperand(1).getMBB()); 2313 2314 I.eraseFromParent(); 2315 return true; 2316 } 2317 2318 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE( 2319 MachineInstr &I) const { 2320 Register DstReg = I.getOperand(0).getReg(); 2321 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2322 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2323 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 2324 if (IsVGPR) 2325 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 2326 2327 return RBI.constrainGenericRegister( 2328 DstReg, IsVGPR ?
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 2329 } 2330 2331 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 2332 Register DstReg = I.getOperand(0).getReg(); 2333 Register SrcReg = I.getOperand(1).getReg(); 2334 Register MaskReg = I.getOperand(2).getReg(); 2335 LLT Ty = MRI->getType(DstReg); 2336 LLT MaskTy = MRI->getType(MaskReg); 2337 2338 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2339 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2340 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 2341 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2342 if (DstRB != SrcRB) // Should only happen for hand written MIR. 2343 return false; 2344 2345 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2346 const TargetRegisterClass &RegRC 2347 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2348 2349 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 2350 *MRI); 2351 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 2352 *MRI); 2353 const TargetRegisterClass *MaskRC = 2354 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); 2355 2356 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2357 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2358 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 2359 return false; 2360 2361 MachineBasicBlock *BB = I.getParent(); 2362 const DebugLoc &DL = I.getDebugLoc(); 2363 if (Ty.getSizeInBits() == 32) { 2364 assert(MaskTy.getSizeInBits() == 32 && 2365 "ptrmask should have been narrowed during legalize"); 2366 2367 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 2368 .addReg(SrcReg) 2369 .addReg(MaskReg); 2370 I.eraseFromParent(); 2371 return true; 2372 } 2373 2374 Register HiReg = MRI->createVirtualRegister(&RegRC); 2375 Register LoReg = MRI->createVirtualRegister(&RegRC); 2376 2377 // Extract the subregisters from the source pointer. 2378 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 2379 .addReg(SrcReg, 0, AMDGPU::sub0); 2380 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 2381 .addReg(SrcReg, 0, AMDGPU::sub1); 2382 2383 Register MaskedLo, MaskedHi; 2384 2385 // Try to avoid emitting a bit operation when we only need to touch half of 2386 // the 64-bit pointer. 2387 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); 2388 2389 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 2390 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 2391 if ((MaskOnes & MaskLo32) == MaskLo32) { 2392 // If all the bits in the low half are 1, we only need a copy for it. 2393 MaskedLo = LoReg; 2394 } else { 2395 // Extract the mask subregister and apply the and. 2396 Register MaskLo = MRI->createVirtualRegister(&RegRC); 2397 MaskedLo = MRI->createVirtualRegister(&RegRC); 2398 2399 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 2400 .addReg(MaskReg, 0, AMDGPU::sub0); 2401 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 2402 .addReg(LoReg) 2403 .addReg(MaskLo); 2404 } 2405 2406 if ((MaskOnes & MaskHi32) == MaskHi32) { 2407 // If all the bits in the high half are 1, we only need a copy for it. 
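    // (The REG_SEQUENCE built below then uses HiReg directly as the sub1 half.)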
2408 MaskedHi = HiReg; 2409 } else { 2410 Register MaskHi = MRI->createVirtualRegister(&RegRC); 2411 MaskedHi = MRI->createVirtualRegister(&RegRC); 2412 2413 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 2414 .addReg(MaskReg, 0, AMDGPU::sub1); 2415 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 2416 .addReg(HiReg) 2417 .addReg(MaskHi); 2418 } 2419 2420 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2421 .addReg(MaskedLo) 2422 .addImm(AMDGPU::sub0) 2423 .addReg(MaskedHi) 2424 .addImm(AMDGPU::sub1); 2425 I.eraseFromParent(); 2426 return true; 2427 } 2428 2429 /// Return the register to use for the index value, and the subregister to use 2430 /// for the indirectly accessed register. 2431 static std::pair<Register, unsigned> 2432 computeIndirectRegIndex(MachineRegisterInfo &MRI, 2433 const SIRegisterInfo &TRI, 2434 const TargetRegisterClass *SuperRC, 2435 Register IdxReg, 2436 unsigned EltSize) { 2437 Register IdxBaseReg; 2438 int Offset; 2439 MachineInstr *Unused; 2440 2441 std::tie(IdxBaseReg, Offset, Unused) 2442 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 2443 if (IdxBaseReg == AMDGPU::NoRegister) { 2444 // This will happen if the index is a known constant. This should ordinarily 2445 // be legalized out, but handle it as a register just in case. 2446 assert(Offset == 0); 2447 IdxBaseReg = IdxReg; 2448 } 2449 2450 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 2451 2452 // Skip out of bounds offsets, or else we would end up using an undefined 2453 // register. 2454 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 2455 return std::make_pair(IdxReg, SubRegs[0]); 2456 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 2457 } 2458 2459 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 2460 MachineInstr &MI) const { 2461 Register DstReg = MI.getOperand(0).getReg(); 2462 Register SrcReg = MI.getOperand(1).getReg(); 2463 Register IdxReg = MI.getOperand(2).getReg(); 2464 2465 LLT DstTy = MRI->getType(DstReg); 2466 LLT SrcTy = MRI->getType(SrcReg); 2467 2468 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2469 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2470 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2471 2472 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2473 // into a waterfall loop. 2474 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2475 return false; 2476 2477 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 2478 *MRI); 2479 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 2480 *MRI); 2481 if (!SrcRC || !DstRC) 2482 return false; 2483 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2484 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2485 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2486 return false; 2487 2488 MachineBasicBlock *BB = MI.getParent(); 2489 const DebugLoc &DL = MI.getDebugLoc(); 2490 const bool Is64 = DstTy.getSizeInBits() == 64; 2491 2492 unsigned SubReg; 2493 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 2494 DstTy.getSizeInBits() / 8); 2495 2496 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 2497 if (DstTy.getSizeInBits() != 32 && !Is64) 2498 return false; 2499 2500 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2501 .addReg(IdxReg); 2502 2503 unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 2504 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 2505 .addReg(SrcReg, 0, SubReg) 2506 .addReg(SrcReg, RegState::Implicit); 2507 MI.eraseFromParent(); 2508 return true; 2509 } 2510 2511 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 2512 return false; 2513 2514 if (!STI.useVGPRIndexMode()) { 2515 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2516 .addReg(IdxReg); 2517 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 2518 .addReg(SrcReg, RegState::Undef, SubReg) 2519 .addReg(SrcReg, RegState::Implicit); 2520 MI.eraseFromParent(); 2521 return true; 2522 } 2523 2524 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2525 .addReg(IdxReg) 2526 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 2527 BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg) 2528 .addReg(SrcReg, RegState::Undef, SubReg) 2529 .addReg(SrcReg, RegState::Implicit) 2530 .addReg(AMDGPU::M0, RegState::Implicit); 2531 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2532 2533 MI.eraseFromParent(); 2534 return true; 2535 } 2536 2537 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 2538 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 2539 MachineInstr &MI) const { 2540 Register DstReg = MI.getOperand(0).getReg(); 2541 Register VecReg = MI.getOperand(1).getReg(); 2542 Register ValReg = MI.getOperand(2).getReg(); 2543 Register IdxReg = MI.getOperand(3).getReg(); 2544 2545 LLT VecTy = MRI->getType(DstReg); 2546 LLT ValTy = MRI->getType(ValReg); 2547 unsigned VecSize = VecTy.getSizeInBits(); 2548 unsigned ValSize = ValTy.getSizeInBits(); 2549 2550 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 2551 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 2552 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2553 2554 assert(VecTy.getElementType() == ValTy); 2555 2556 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2557 // into a waterfall loop. 
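  // The indirect write below goes through M0 or GPR index mode, both of which
  // take a scalar index, so a VGPR index cannot be selected here directly.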
2558 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2559 return false; 2560 2561 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 2562 *MRI); 2563 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 2564 *MRI); 2565 2566 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 2567 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 2568 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 2569 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2570 return false; 2571 2572 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 2573 return false; 2574 2575 unsigned SubReg; 2576 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 2577 ValSize / 8); 2578 2579 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 2580 STI.useVGPRIndexMode(); 2581 2582 MachineBasicBlock *BB = MI.getParent(); 2583 const DebugLoc &DL = MI.getDebugLoc(); 2584 2585 if (IndexMode) { 2586 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2587 .addReg(IdxReg) 2588 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 2589 } else { 2590 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2591 .addReg(IdxReg); 2592 } 2593 2594 const MCInstrDesc &RegWriteOp 2595 = TII.getIndirectRegWritePseudo(VecSize, ValSize, 2596 VecRB->getID() == AMDGPU::SGPRRegBankID); 2597 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 2598 .addReg(VecReg) 2599 .addReg(ValReg) 2600 .addImm(SubReg); 2601 2602 if (IndexMode) 2603 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2604 2605 MI.eraseFromParent(); 2606 return true; 2607 } 2608 2609 static bool isZeroOrUndef(int X) { 2610 return X == 0 || X == -1; 2611 } 2612 2613 static bool isOneOrUndef(int X) { 2614 return X == 1 || X == -1; 2615 } 2616 2617 static bool isZeroOrOneOrUndef(int X) { 2618 return X == 0 || X == 1 || X == -1; 2619 } 2620 2621 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single 2622 // 32-bit register. 2623 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, 2624 ArrayRef<int> Mask) { 2625 NewMask[0] = Mask[0]; 2626 NewMask[1] = Mask[1]; 2627 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) 2628 return Src0; 2629 2630 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); 2631 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); 2632 2633 // Shift the mask inputs to be 0/1; 2634 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; 2635 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; 2636 return Src1; 2637 } 2638 2639 // This is only legal with VOP3P instructions as an aid to op_sel matching. 
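// A rough summary of the <2 x s16> cases handled below (u = undef lane,
// masks shown after normalization onto a single source):
//   <u,u>         -> IMPLICIT_DEF
//   <0/u, 1/u>    -> plain COPY of the chosen source
//   <1,u> / <u,0> -> a single 16-bit shift
//   <0,0> / <1,1> -> broadcast one half (SDWA mov or S_PACK)
//   <1,0>         -> swap halves (V_ALIGNBIT_B32, or shift + S_PACK_LL)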
2640 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( 2641 MachineInstr &MI) const { 2642 Register DstReg = MI.getOperand(0).getReg(); 2643 Register Src0Reg = MI.getOperand(1).getReg(); 2644 Register Src1Reg = MI.getOperand(2).getReg(); 2645 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); 2646 2647 const LLT V2S16 = LLT::vector(2, 16); 2648 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) 2649 return false; 2650 2651 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) 2652 return false; 2653 2654 assert(ShufMask.size() == 2); 2655 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); 2656 2657 MachineBasicBlock *MBB = MI.getParent(); 2658 const DebugLoc &DL = MI.getDebugLoc(); 2659 2660 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2661 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2662 const TargetRegisterClass &RC = IsVALU ? 2663 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2664 2665 // Handle the degenerate case which should have folded out. 2666 if (ShufMask[0] == -1 && ShufMask[1] == -1) { 2667 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); 2668 2669 MI.eraseFromParent(); 2670 return RBI.constrainGenericRegister(DstReg, RC, *MRI); 2671 } 2672 2673 // A legal VOP3P mask only reads one of the sources. 2674 int Mask[2]; 2675 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); 2676 2677 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || 2678 !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) 2679 return false; 2680 2681 // TODO: This also should have been folded out 2682 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { 2683 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) 2684 .addReg(SrcVec); 2685 2686 MI.eraseFromParent(); 2687 return true; 2688 } 2689 2690 if (Mask[0] == 1 && Mask[1] == -1) { 2691 if (IsVALU) { 2692 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 2693 .addImm(16) 2694 .addReg(SrcVec); 2695 } else { 2696 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 2697 .addReg(SrcVec) 2698 .addImm(16); 2699 } 2700 } else if (Mask[0] == -1 && Mask[1] == 0) { 2701 if (IsVALU) { 2702 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) 2703 .addImm(16) 2704 .addReg(SrcVec); 2705 } else { 2706 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) 2707 .addReg(SrcVec) 2708 .addImm(16); 2709 } 2710 } else if (Mask[0] == 0 && Mask[1] == 0) { 2711 if (IsVALU) { 2712 // Write low half of the register into the high half. 2713 MachineInstr *MovSDWA = 2714 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2715 .addImm(0) // $src0_modifiers 2716 .addReg(SrcVec) // $src0 2717 .addImm(0) // $clamp 2718 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2719 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2720 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2721 .addReg(SrcVec, RegState::Implicit); 2722 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2723 } else { 2724 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2725 .addReg(SrcVec) 2726 .addReg(SrcVec); 2727 } 2728 } else if (Mask[0] == 1 && Mask[1] == 1) { 2729 if (IsVALU) { 2730 // Write high half of the register into the low half. 
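      // As in the <0,0> case above but with dst_sel/src0_sel swapped: WORD_1
      // of SrcVec is written into WORD_0 of DstReg, while the preserved high
      // half (tied use of SrcVec) already holds that value, giving <hi, hi>.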
2731 MachineInstr *MovSDWA = 2732 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2733 .addImm(0) // $src0_modifiers 2734 .addReg(SrcVec) // $src0 2735 .addImm(0) // $clamp 2736 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel 2737 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2738 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel 2739 .addReg(SrcVec, RegState::Implicit); 2740 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2741 } else { 2742 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) 2743 .addReg(SrcVec) 2744 .addReg(SrcVec); 2745 } 2746 } else if (Mask[0] == 1 && Mask[1] == 0) { 2747 if (IsVALU) { 2748 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg) 2749 .addReg(SrcVec) 2750 .addReg(SrcVec) 2751 .addImm(16); 2752 } else { 2753 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2754 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) 2755 .addReg(SrcVec) 2756 .addImm(16); 2757 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2758 .addReg(TmpReg) 2759 .addReg(SrcVec); 2760 } 2761 } else 2762 llvm_unreachable("all shuffle masks should be handled"); 2763 2764 MI.eraseFromParent(); 2765 return true; 2766 } 2767 2768 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 2769 if (I.isPHI()) 2770 return selectPHI(I); 2771 2772 if (!I.isPreISelOpcode()) { 2773 if (I.isCopy()) 2774 return selectCOPY(I); 2775 return true; 2776 } 2777 2778 switch (I.getOpcode()) { 2779 case TargetOpcode::G_AND: 2780 case TargetOpcode::G_OR: 2781 case TargetOpcode::G_XOR: 2782 if (selectImpl(I, *CoverageInfo)) 2783 return true; 2784 return selectG_AND_OR_XOR(I); 2785 case TargetOpcode::G_ADD: 2786 case TargetOpcode::G_SUB: 2787 if (selectImpl(I, *CoverageInfo)) 2788 return true; 2789 return selectG_ADD_SUB(I); 2790 case TargetOpcode::G_UADDO: 2791 case TargetOpcode::G_USUBO: 2792 case TargetOpcode::G_UADDE: 2793 case TargetOpcode::G_USUBE: 2794 return selectG_UADDO_USUBO_UADDE_USUBE(I); 2795 case TargetOpcode::G_INTTOPTR: 2796 case TargetOpcode::G_BITCAST: 2797 case TargetOpcode::G_PTRTOINT: 2798 return selectCOPY(I); 2799 case TargetOpcode::G_CONSTANT: 2800 case TargetOpcode::G_FCONSTANT: 2801 return selectG_CONSTANT(I); 2802 case TargetOpcode::G_FNEG: 2803 if (selectImpl(I, *CoverageInfo)) 2804 return true; 2805 return selectG_FNEG(I); 2806 case TargetOpcode::G_FABS: 2807 if (selectImpl(I, *CoverageInfo)) 2808 return true; 2809 return selectG_FABS(I); 2810 case TargetOpcode::G_EXTRACT: 2811 return selectG_EXTRACT(I); 2812 case TargetOpcode::G_MERGE_VALUES: 2813 case TargetOpcode::G_BUILD_VECTOR: 2814 case TargetOpcode::G_CONCAT_VECTORS: 2815 return selectG_MERGE_VALUES(I); 2816 case TargetOpcode::G_UNMERGE_VALUES: 2817 return selectG_UNMERGE_VALUES(I); 2818 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2819 return selectG_BUILD_VECTOR_TRUNC(I); 2820 case TargetOpcode::G_PTR_ADD: 2821 return selectG_PTR_ADD(I); 2822 case TargetOpcode::G_IMPLICIT_DEF: 2823 return selectG_IMPLICIT_DEF(I); 2824 case TargetOpcode::G_FREEZE: 2825 return selectCOPY(I); 2826 case TargetOpcode::G_INSERT: 2827 return selectG_INSERT(I); 2828 case TargetOpcode::G_INTRINSIC: 2829 return selectG_INTRINSIC(I); 2830 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 2831 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 2832 case TargetOpcode::G_ICMP: 2833 if (selectG_ICMP(I)) 2834 return true; 2835 return selectImpl(I, *CoverageInfo); 2836 case TargetOpcode::G_LOAD: 2837 case TargetOpcode::G_ATOMIC_CMPXCHG: 2838 case TargetOpcode::G_ATOMICRMW_XCHG: 2839 case 
TargetOpcode::G_ATOMICRMW_ADD: 2840 case TargetOpcode::G_ATOMICRMW_SUB: 2841 case TargetOpcode::G_ATOMICRMW_AND: 2842 case TargetOpcode::G_ATOMICRMW_OR: 2843 case TargetOpcode::G_ATOMICRMW_XOR: 2844 case TargetOpcode::G_ATOMICRMW_MIN: 2845 case TargetOpcode::G_ATOMICRMW_MAX: 2846 case TargetOpcode::G_ATOMICRMW_UMIN: 2847 case TargetOpcode::G_ATOMICRMW_UMAX: 2848 case TargetOpcode::G_ATOMICRMW_FADD: 2849 case AMDGPU::G_AMDGPU_ATOMIC_INC: 2850 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 2851 return selectG_LOAD_ATOMICRMW(I); 2852 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 2853 return selectG_AMDGPU_ATOMIC_CMPXCHG(I); 2854 case TargetOpcode::G_SELECT: 2855 return selectG_SELECT(I); 2856 case TargetOpcode::G_STORE: 2857 return selectG_STORE(I); 2858 case TargetOpcode::G_TRUNC: 2859 return selectG_TRUNC(I); 2860 case TargetOpcode::G_SEXT: 2861 case TargetOpcode::G_ZEXT: 2862 case TargetOpcode::G_ANYEXT: 2863 case TargetOpcode::G_SEXT_INREG: 2864 if (selectImpl(I, *CoverageInfo)) 2865 return true; 2866 return selectG_SZA_EXT(I); 2867 case TargetOpcode::G_BRCOND: 2868 return selectG_BRCOND(I); 2869 case TargetOpcode::G_FRAME_INDEX: 2870 case TargetOpcode::G_GLOBAL_VALUE: 2871 return selectG_FRAME_INDEX_GLOBAL_VALUE(I); 2872 case TargetOpcode::G_PTRMASK: 2873 return selectG_PTRMASK(I); 2874 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2875 return selectG_EXTRACT_VECTOR_ELT(I); 2876 case TargetOpcode::G_INSERT_VECTOR_ELT: 2877 return selectG_INSERT_VECTOR_ELT(I); 2878 case TargetOpcode::G_SHUFFLE_VECTOR: 2879 return selectG_SHUFFLE_VECTOR(I); 2880 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 2881 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 2882 const AMDGPU::ImageDimIntrinsicInfo *Intr 2883 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); 2884 assert(Intr && "not an image intrinsic with image pseudo"); 2885 return selectImageIntrinsic(I, Intr); 2886 } 2887 default: 2888 return selectImpl(I, *CoverageInfo); 2889 } 2890 return false; 2891 } 2892 2893 InstructionSelector::ComplexRendererFns 2894 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 2895 return {{ 2896 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2897 }}; 2898 2899 } 2900 2901 std::pair<Register, unsigned> 2902 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const { 2903 Register Src = Root.getReg(); 2904 Register OrigSrc = Src; 2905 unsigned Mods = 0; 2906 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 2907 2908 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 2909 Src = MI->getOperand(1).getReg(); 2910 Mods |= SISrcMods::NEG; 2911 MI = getDefIgnoringCopies(Src, *MRI); 2912 } 2913 2914 if (MI && MI->getOpcode() == AMDGPU::G_FABS) { 2915 Src = MI->getOperand(1).getReg(); 2916 Mods |= SISrcMods::ABS; 2917 } 2918 2919 if (Mods != 0 && 2920 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 2921 MachineInstr *UseMI = Root.getParent(); 2922 2923 // If we looked through copies to find source modifiers on an SGPR operand, 2924 // we now have an SGPR register source. To avoid potentially violating the 2925 // constant bus restriction, we need to insert a copy to a VGPR. 2926 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); 2927 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 2928 TII.get(AMDGPU::COPY), VGPRSrc) 2929 .addReg(Src); 2930 Src = VGPRSrc; 2931 } 2932 2933 return std::make_pair(Src, Mods); 2934 } 2935 2936 /// 2937 /// This will select either an SGPR or VGPR operand and will save us from 2938 /// having to write an extra tablegen pattern. 
2939 InstructionSelector::ComplexRendererFns 2940 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 2941 return {{ 2942 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2943 }}; 2944 } 2945 2946 InstructionSelector::ComplexRendererFns 2947 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 2948 Register Src; 2949 unsigned Mods; 2950 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2951 2952 return {{ 2953 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2954 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 2955 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2956 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2957 }}; 2958 } 2959 2960 InstructionSelector::ComplexRendererFns 2961 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 2962 return {{ 2963 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2964 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2965 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2966 }}; 2967 } 2968 2969 InstructionSelector::ComplexRendererFns 2970 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 2971 Register Src; 2972 unsigned Mods; 2973 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2974 2975 return {{ 2976 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2977 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2978 }}; 2979 } 2980 2981 InstructionSelector::ComplexRendererFns 2982 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 2983 Register Reg = Root.getReg(); 2984 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 2985 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 2986 Def->getOpcode() == AMDGPU::G_FABS)) 2987 return {}; 2988 return {{ 2989 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2990 }}; 2991 } 2992 2993 std::pair<Register, unsigned> 2994 AMDGPUInstructionSelector::selectVOP3PModsImpl( 2995 Register Src, const MachineRegisterInfo &MRI) const { 2996 unsigned Mods = 0; 2997 MachineInstr *MI = MRI.getVRegDef(Src); 2998 2999 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 3000 // It's possible to see an f32 fneg here, but unlikely. 3001 // TODO: Treat f32 fneg as only high bit. 3002 MRI.getType(Src) == LLT::vector(2, 16)) { 3003 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 3004 Src = MI->getOperand(1).getReg(); 3005 MI = MRI.getVRegDef(Src); 3006 } 3007 3008 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 3009 3010 // Packed instructions do not have abs modifiers. 
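  // OP_SEL_1 is the op_sel_hi bit for this source; keeping it set is assumed
  // to be the neutral default, i.e. the high half of the packed operation
  // reads the high half of the source.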
3011 Mods |= SISrcMods::OP_SEL_1; 3012 3013 return std::make_pair(Src, Mods); 3014 } 3015 3016 InstructionSelector::ComplexRendererFns 3017 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 3018 MachineRegisterInfo &MRI 3019 = Root.getParent()->getParent()->getParent()->getRegInfo(); 3020 3021 Register Src; 3022 unsigned Mods; 3023 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 3024 3025 return {{ 3026 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3027 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3028 }}; 3029 } 3030 3031 InstructionSelector::ComplexRendererFns 3032 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 3033 Register Src; 3034 unsigned Mods; 3035 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3036 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) 3037 return None; 3038 3039 return {{ 3040 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3041 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3042 }}; 3043 } 3044 3045 InstructionSelector::ComplexRendererFns 3046 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 3047 // FIXME: Handle op_sel 3048 return {{ 3049 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3050 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 3051 }}; 3052 } 3053 3054 InstructionSelector::ComplexRendererFns 3055 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 3056 SmallVector<GEPInfo, 4> AddrInfo; 3057 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3058 3059 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3060 return None; 3061 3062 const GEPInfo &GEPInfo = AddrInfo[0]; 3063 Optional<int64_t> EncodedImm = 3064 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false); 3065 if (!EncodedImm) 3066 return None; 3067 3068 unsigned PtrReg = GEPInfo.SgprParts[0]; 3069 return {{ 3070 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3071 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3072 }}; 3073 } 3074 3075 InstructionSelector::ComplexRendererFns 3076 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 3077 SmallVector<GEPInfo, 4> AddrInfo; 3078 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3079 3080 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3081 return None; 3082 3083 const GEPInfo &GEPInfo = AddrInfo[0]; 3084 Register PtrReg = GEPInfo.SgprParts[0]; 3085 Optional<int64_t> EncodedImm = 3086 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 3087 if (!EncodedImm) 3088 return None; 3089 3090 return {{ 3091 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3092 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3093 }}; 3094 } 3095 3096 InstructionSelector::ComplexRendererFns 3097 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 3098 MachineInstr *MI = Root.getParent(); 3099 MachineBasicBlock *MBB = MI->getParent(); 3100 3101 SmallVector<GEPInfo, 4> AddrInfo; 3102 getAddrModeInfo(*MI, *MRI, AddrInfo); 3103 3104 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, 3105 // then we can select all ptr + 32-bit offsets not just immediate offsets. 3106 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3107 return None; 3108 3109 const GEPInfo &GEPInfo = AddrInfo[0]; 3110 // SGPR offset is unsigned. 
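  // Reject a zero, negative, or wider-than-32-bit offset; anything else is
  // materialized into an SGPR with S_MOV_B32 below and rendered as the _SGPR
  // form's offset operand.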
3111 if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) 3112 return None; 3113 3114 // If we make it this far we have a load with a 32-bit immediate offset. 3115 // It is OK to select this using an SGPR offset, because we have already 3116 // failed trying to select this load into one of the _IMM variants since 3117 // the _IMM Patterns are considered before the _SGPR patterns. 3118 Register PtrReg = GEPInfo.SgprParts[0]; 3119 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3120 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) 3121 .addImm(GEPInfo.Imm); 3122 return {{ 3123 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3124 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } 3125 }}; 3126 } 3127 3128 template <bool Signed> 3129 InstructionSelector::ComplexRendererFns 3130 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { 3131 MachineInstr *MI = Root.getParent(); 3132 3133 InstructionSelector::ComplexRendererFns Default = {{ 3134 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3135 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset 3136 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 3137 }}; 3138 3139 if (!STI.hasFlatInstOffsets()) 3140 return Default; 3141 3142 const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); 3143 if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD) 3144 return Default; 3145 3146 Optional<int64_t> Offset = 3147 getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); 3148 if (!Offset.hasValue()) 3149 return Default; 3150 3151 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 3152 if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed)) 3153 return Default; 3154 3155 Register BasePtr = OpDef->getOperand(1).getReg(); 3156 3157 return {{ 3158 [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, 3159 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, 3160 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 3161 }}; 3162 } 3163 3164 InstructionSelector::ComplexRendererFns 3165 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 3166 return selectFlatOffsetImpl<false>(Root); 3167 } 3168 3169 InstructionSelector::ComplexRendererFns 3170 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { 3171 return selectFlatOffsetImpl<true>(Root); 3172 } 3173 3174 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 3175 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 3176 return PSV && PSV->isStack(); 3177 } 3178 3179 InstructionSelector::ComplexRendererFns 3180 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 3181 MachineInstr *MI = Root.getParent(); 3182 MachineBasicBlock *MBB = MI->getParent(); 3183 MachineFunction *MF = MBB->getParent(); 3184 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 3185 3186 int64_t Offset = 0; 3187 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) && 3188 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) { 3189 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3190 3191 // TODO: Should this be inside the render function? The iterator seems to 3192 // move.
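    // Split the constant: the 4 KiB-aligned high part is materialized into a
    // VGPR used as vaddr, and the low 12 bits become the MUBUF immediate
    // offset rendered further down.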
3193 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 3194 HighBits) 3195 .addImm(Offset & ~4095); 3196 3197 return {{[=](MachineInstrBuilder &MIB) { // rsrc 3198 MIB.addReg(Info->getScratchRSrcReg()); 3199 }, 3200 [=](MachineInstrBuilder &MIB) { // vaddr 3201 MIB.addReg(HighBits); 3202 }, 3203 [=](MachineInstrBuilder &MIB) { // soffset 3204 const MachineMemOperand *MMO = *MI->memoperands_begin(); 3205 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 3206 3207 if (isStackPtrRelative(PtrInfo)) 3208 MIB.addReg(Info->getStackPtrOffsetReg()); 3209 else 3210 MIB.addImm(0); 3211 }, 3212 [=](MachineInstrBuilder &MIB) { // offset 3213 MIB.addImm(Offset & 4095); 3214 }}}; 3215 } 3216 3217 assert(Offset == 0 || Offset == -1); 3218 3219 // Try to fold a frame index directly into the MUBUF vaddr field, and any 3220 // offsets. 3221 Optional<int> FI; 3222 Register VAddr = Root.getReg(); 3223 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) { 3224 if (isBaseWithConstantOffset(Root, *MRI)) { 3225 const MachineOperand &LHS = RootDef->getOperand(1); 3226 const MachineOperand &RHS = RootDef->getOperand(2); 3227 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); 3228 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); 3229 if (LHSDef && RHSDef) { 3230 int64_t PossibleOffset = 3231 RHSDef->getOperand(1).getCImm()->getSExtValue(); 3232 if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && 3233 (!STI.privateMemoryResourceIsRangeChecked() || 3234 KnownBits->signBitIsZero(LHS.getReg()))) { 3235 if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) 3236 FI = LHSDef->getOperand(1).getIndex(); 3237 else 3238 VAddr = LHS.getReg(); 3239 Offset = PossibleOffset; 3240 } 3241 } 3242 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { 3243 FI = RootDef->getOperand(1).getIndex(); 3244 } 3245 } 3246 3247 return {{[=](MachineInstrBuilder &MIB) { // rsrc 3248 MIB.addReg(Info->getScratchRSrcReg()); 3249 }, 3250 [=](MachineInstrBuilder &MIB) { // vaddr 3251 if (FI.hasValue()) 3252 MIB.addFrameIndex(FI.getValue()); 3253 else 3254 MIB.addReg(VAddr); 3255 }, 3256 [=](MachineInstrBuilder &MIB) { // soffset 3257 // If we don't know this private access is a local stack object, it 3258 // needs to be relative to the entry point's scratch wave offset. 3259 // TODO: Should split large offsets that don't fit like above. 3260 // TODO: Don't use scratch wave offset just because the offset 3261 // didn't fit. 3262 if (!Info->isEntryFunction() && FI.hasValue()) 3263 MIB.addReg(Info->getStackPtrOffsetReg()); 3264 else 3265 MIB.addImm(0); 3266 }, 3267 [=](MachineInstrBuilder &MIB) { // offset 3268 MIB.addImm(Offset); 3269 }}}; 3270 } 3271 3272 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, 3273 int64_t Offset, 3274 unsigned OffsetBits) const { 3275 if ((OffsetBits == 16 && !isUInt<16>(Offset)) || 3276 (OffsetBits == 8 && !isUInt<8>(Offset))) 3277 return false; 3278 3279 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 3280 return true; 3281 3282 // On Southern Islands, instructions with a negative base value and an offset 3283 // don't seem to work.
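  // Only fold the offset if known-bits proves the sign bit of the base is
  // clear.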
  return KnownBits->signBitIsZero(Base);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(Info->getScratchRSrcReg());
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (isStackPtrRelative(PtrInfo))
          MIB.addReg(Info->getStackPtrOffsetReg());
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    if (isDSOffsetLegal(PtrBase, Offset, 16)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, Offset);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
    }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    int64_t DWordOffset0 = Offset / 4;
    int64_t DWordOffset1 = DWordOffset0 + 1;
    if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, DWordOffset0);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this
/// does not match the pattern.
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
  Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = MRI.getVRegDef(Root);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0};

  MachineOperand &RHS = RootI->getOperand(2);
  Optional<ValueAndVReg> MaybeOffset
    = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
  if (!MaybeOffset)
    return {Root, 0};
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
}

static void addZeroImm(MachineInstrBuilder &MIB) {
  MIB.addImm(0);
}

/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If
/// \p BasePtr is not valid, a null base pointer will be used.
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}

static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}

static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: Don't know if this was defined by operand 0
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return whether the addr64 mubuf mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(SOffset)
    .addImm(ImmOffset);
  ImmOffset = 0;
}

bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
  Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // The addr64 bit was removed for Volcanic Islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
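      // Use it as the SRD base pointer and let N3 supply the 64-bit vaddr.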
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource.
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
  int64_t &Offset) const {
  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Register SRDPtr = AddrData.N0;
  Offset = AddrData.Offset;

  // TODO: Look through extensions for 32-bit soffset.
  MachineIRBuilder B(*Root.getParent());

  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm // slc
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm // slc
    }};
}

/// Get an immediate that must be 32 bits, and treated as zero extended.
static Optional<uint64_t> getConstantZext32Val(Register Reg,
                                               const MachineRegisterInfo &MRI) {
  // getConstantVRegVal sexts any values, so see if that matters.
  Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return None;
  return Lo_32(*OffsetVal);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm
    = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx == -1);

  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
G_CONSTANT"); 3802 MIB.addImm(Op.getCImm()->getSExtValue()); 3803 } 3804 } 3805 3806 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, 3807 const MachineInstr &MI, 3808 int OpIdx) const { 3809 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 3810 "Expected G_CONSTANT"); 3811 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); 3812 } 3813 3814 /// This only really exists to satisfy DAG type checking machinery, so is a 3815 /// no-op here. 3816 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, 3817 const MachineInstr &MI, 3818 int OpIdx) const { 3819 MIB.addImm(MI.getOperand(OpIdx).getImm()); 3820 } 3821 3822 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB, 3823 const MachineInstr &MI, 3824 int OpIdx) const { 3825 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3826 MIB.addImm(MI.getOperand(OpIdx).getImm() & 1); 3827 } 3828 3829 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB, 3830 const MachineInstr &MI, 3831 int OpIdx) const { 3832 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3833 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1); 3834 } 3835 3836 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB, 3837 const MachineInstr &MI, 3838 int OpIdx) const { 3839 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3840 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1); 3841 } 3842 3843 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, 3844 const MachineInstr &MI, 3845 int OpIdx) const { 3846 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3847 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); 3848 } 3849 3850 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { 3851 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); 3852 } 3853 3854 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const { 3855 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm()); 3856 } 3857 3858 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const { 3859 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm()); 3860 } 3861 3862 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { 3863 return TII.isInlineConstant(Imm); 3864 } 3865