//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
                                        CodeGenCoverage &CoverageInfo) {
  MRI = &MF.getRegInfo();
  InstructionSelector::setupMF(MF, KB, CoverageInfo);
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  if (Register::isPhysicalRegister(Reg))
    return Reg == TRI.getVCC();

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to know
    // specifically that the result is VCCRegBank, and not just an SGPR with
    // size 1. An SReg_32 with size 1 is ambiguous with wave32.
    if (Src.isUndef()) {
      const TargetRegisterClass *SrcRC =
        TRI.getConstrainedRegClassForOperand(Src, *MRI);
      if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (Register::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  MachineOperand &Dst = I.getOperand(0);
  MachineOperand &Src0 = I.getOperand(1);
  MachineOperand &Src1 = I.getOperand(2);
  Register DstReg = Dst.getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
    const TargetRegisterClass *RC = TRI.getBoolRC();
    unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
                                           RC == &AMDGPU::SReg_64RegClass);
    I.setDesc(TII.get(InstOpc));
    // Dead implicit-def of scc
    I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                           true,  // isImp
                                           false, // isKill
                                           true)); // isDead

    // FIXME: Hack to avoid turning the register bank into a register class.
    // The selector for G_ICMP relies on seeing that the register bank for the
    // result is VCC. In wave32, if we constrain the registers to SReg_32 here,
    // it will be ambiguous whether it's a scalar or vector bool.
    if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg()))
      MRI->setRegClass(Src0.getReg(), RC);
    if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg()))
      MRI->setRegClass(Src1.getReg(), RC);

    return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
  }

  // TODO: Should this allow an SCC bank result, and produce a copy from SCC
  // for the result?
  if (DstRB->getID() == AMDGPU::SGPRRegBankID) {
    unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32);
    I.setDesc(TII.get(InstOpc));
    // Dead implicit-def of scc
    I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                           true,  // isImp
                                           false, // isKill
                                           true)); // isDead
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;
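    // The VOP3 add/sub forms used here always define a carry-out, so a
    // scratch wave-mask register is created below and immediately marked dead
    // since the result of the carry is unused.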
    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have an
    // unsigned carry out despite the _i32 name. These were renamed in VI to
    // _U32.
    // FIXME: We should probably rename the opcodes here.
    unsigned NoCarryOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
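    // Add the operands the generic opcode lacks: an implicit exec use and a
    // trailing zero immediate (likely the clamp modifier, matching how the
    // other VOP3 add/sub selections in this file append it).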
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
    .addReg(AMDGPU::SCC);

  if (!MRI->getRegClassOrNull(Dst1Reg))
    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  const unsigned SrcFlags = getUndefRegState(Src.isUndef());

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, SrcFlags, SubRegs[I]);

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

static bool isZero(Register Reg, const MachineRegisterInfo &MRI) {
  int64_t Val;
  return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
    MachineInstr &MI) const {
  if (selectImpl(MI, *CoverageInfo))
    return true;

  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != V2S16)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI->getType(Src0) != S32)
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.RemoveOperand(2);
    return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
           RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
  }

  Register ShiftSrc0;
  Register ShiftSrc1;
  int64_t ShiftAmt;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  // FIXME: This is an inconvenient way to check a specific value
  bool Shift0 = mi_match(
    Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  bool Shift1 = mi_match(
    Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0 && isZero(Src1, *MRI)) {
    // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
    auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
      .addReg(ShiftSrc0)
      .addImm(16);

    MI.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
  const TargetRegisterClass *Src1RC =
    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64;
  else
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;
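  // The last intrinsic operand picks which value is forwarded as src0 of
  // v_div_scale; both the numerator and denominator are still passed as the
  // remaining source operands below.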
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addUse(Src0)
    .addUse(Denom)
    .addUse(Numer);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::WWM);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
    return selectIntrinsicIcmp(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
      .add(I.getOperand(2))
      .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
      constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
      RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
                               I.getOperand(0).getReg())
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
                               *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
  MachineBasicBlock *BB = MI.getParent();
  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
    .add(MI.getOperand(1));

  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
  return true;
}

static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
  switch (MF.getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    return 1;
  case CallingConv::AMDGPU_VS:
    return 2;
  case CallingConv::AMDGPU_GS:
    return 3;
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_ES:
    report_fatal_error("ds_ordered_count unsupported for this calling conv");
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::C:
  case CallingConv::Fast:
  default:
    // Assume other calling conventions are various compute callable functions
    return 0;
  }
}

bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
    MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease)
    report_fatal_error("ds_ordered_count: wave_done requires wave_release");

  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      report_fatal_error(
        "ds_ordered_count: dword count must be between 1 and 4");
    }
  }

  if (IndexOperand)
    report_fatal_error("ds_ordered_count: bad index operand");

  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
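  // As implied by the shifts below (not taken from the ISA docs): the packed
  // DS offset holds the ordered-count byte offset (index * 4) in bits [7:0],
  // wave_release in bit 8, wave_done in bit 9, the shader type in bits
  // [11:10], the add-vs-swap selector in bit 12, and on GFX10 the dword
  // count minus one in bits [15:14].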
  unsigned ShaderType = getDSShaderTypeValue(*MF);

  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                     (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !STI.hasGWSSemaReleaseAll())
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  assert(OffsetDef);

  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only sets the low 16 bits, we could leave it as-is and add 1
    // to the immediate offset.
    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset, OffsetDef)
      = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16);

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);
    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .addImm(-1) // $gds
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
    .addImm(Offset)
    .addImm(IsGDS ? -1 : 0)
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x2;

  return TexFailCtrl == 0;
}

static bool parseCachePolicy(uint64_t Value,
                             bool *GLC, bool *SLC, bool *DLC) {
  if (GLC) {
    *GLC = (Value & 0x1) ? 1 : 0;
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = (Value & 0x2) ? 1 : 0;
    Value &= ~(uint64_t)0x2;
  }
  if (DLC) {
    *DLC = (Value & 0x4) ? 1 : 0;
    Value &= ~(uint64_t)0x4;
  }

  return Value == 0;
}

bool AMDGPUInstructionSelector::selectImageIntrinsic(
    MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
    AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
    AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;

  const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
                                             MI.getNumExplicitDefs());
  int NumVAddr, NumGradients;
  std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = false;

  // XXX - Can we just get the second to last argument for ctrl?
  unsigned CtrlIdx; // Index of texfailctrl argument
  bool Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = true;
    CtrlIdx = VAddrIdx + NumVAddr + 1;
  } else {
    Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
    CtrlIdx = VAddrIdx + NumVAddr + 3;
  }

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients
  if (IsA16 && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    const int DMaskIdx = 2; // Input/output + intrinsic ID.

    DMask = MI.getOperand(DMaskIdx).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      // One memoperand is mandatory, except for getresinfo.
      // FIXME: Check this in verifier.
      if (!MI.memoperands_empty()) {
        const MachineMemOperand *MMO = *MI.memoperands_begin();

        // Infer d16 from the memory size, as the register type will be
        // mangled by unpacked subtargets, or by TFE.
        IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;

        if (IsD16 && !STI.hasUnpackedD16VMem())
          NumVDataDwords = (DMaskLanes + 1) / 2;
      }
    }
  }

  // Optimize _L to _LZ when _L is zero
  if (LZMappingInfo) {
    // The legalizer replaced the register with an immediate 0 if we need to
    // change the opcode.
    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (MIPMappingInfo) {
    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
    }
  }

  // Set G16 opcode
  if (IsG16 && !IsA16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
      AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  bool GLC = false;
  bool SLC = false;
  bool DLC = false;
  if (BaseOpcode->Atomic) {
    GLC = true; // TODO no-return optimization
    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return false;
  } else {
    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return false;
  }

  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (int I = 0; I < NumVAddr; ++I) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register
  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  if (IsTexFail)
    ++NumVDataDwords;

  int Opcode = -1;
  if (IsGFX10) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
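      // For X2 atomics the selected instruction defines a register twice as
      // wide as the intrinsic result; only the low half, which by this
      // lowering holds the returned value, is copied back into the original
      // destination below.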
      MIB.addDef(TmpReg);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
        .addReg(TmpReg, RegState::Kill, SubReg);

    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  for (int i = 0; i != NumVAddrRegs; ++i) {
    MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler

  MIB.addImm(DMask); // dmask

  if (IsGFX10)
    MIB.addImm(DimInfo->Encoding);
  MIB.addImm(Unorm);
  if (IsGFX10)
    MIB.addImm(DLC);

  MIB.addImm(GLC);
  MIB.addImm(SLC);
  MIB.addImm(IsA16 && // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10)
    MIB.addImm(IsA16 ? -1 : 0);

  MIB.addImm(TFE); // tfe
  MIB.addImm(LWE); // lwe
  if (!IsGFX10)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  default: {
    return selectImpl(I, *CoverageInfo);
  }
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class that we use
    // to represent it. So we need to manually set the register class here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
               constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
  if (Size > 32)
    return false;

  MachineInstr *Select =
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .add(I.getOperand(3))
      .addImm(0)
      .add(I.getOperand(2))
      .add(I.getOperand(1));

  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}

static int sizeToSubRegIndex(unsigned Size) {
  switch (Size) {
  case 32:
    return AMDGPU::sub0;
  case 64:
    return AMDGPU::sub0_sub1;
  case 96:
    return AMDGPU::sub0_sub1_sub2;
  case 128:
    return AMDGPU::sub0_sub1_sub2_sub3;
  case 256:
    return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
  default:
    if (Size < 32)
      return AMDGPU::sub0;
    if (Size > 256)
      return -1;
    return sizeToSubRegIndex(PowerOf2Ceil(Size));
  }
}

bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const LLT S1 = LLT::scalar(1);

  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *DstRB;
  if (DstTy == S1) {
    // This is a special case. We don't treat s1 for legalization artifacts as
    // vcc booleans.
    DstRB = SrcRB;
  } else {
    DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
    if (SrcRB != DstRB)
      return false;
  }

  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;

  unsigned DstSize = DstTy.getSizeInBits();
  unsigned SrcSize = SrcTy.getSizeInBits();

  const TargetRegisterClass *SrcRC
    = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
  const TargetRegisterClass *DstRC
    = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
  if (!SrcRC || !DstRC)
    return false;

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
    return false;
  }

  if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
    MachineBasicBlock *MBB = I.getParent();
    const DebugLoc &DL = I.getDebugLoc();

    Register LoReg = MRI->createVirtualRegister(DstRC);
    Register HiReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
      .addReg(SrcReg, 0, AMDGPU::sub0);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
      .addReg(SrcReg, 0, AMDGPU::sub1);

    if (IsVALU && STI.hasSDWA()) {
      // Write the low 16-bits of the high element into the high 16-bits of the
      // low element.
1716 MachineInstr *MovSDWA = 1717 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1718 .addImm(0) // $src0_modifiers 1719 .addReg(HiReg) // $src0 1720 .addImm(0) // $clamp 1721 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1722 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1723 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1724 .addReg(LoReg, RegState::Implicit); 1725 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1726 } else { 1727 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1728 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1729 Register ImmReg = MRI->createVirtualRegister(DstRC); 1730 if (IsVALU) { 1731 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1732 .addImm(16) 1733 .addReg(HiReg); 1734 } else { 1735 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1736 .addReg(HiReg) 1737 .addImm(16); 1738 } 1739 1740 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1741 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1742 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1743 1744 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1745 .addImm(0xffff); 1746 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1747 .addReg(LoReg) 1748 .addReg(ImmReg); 1749 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1750 .addReg(TmpReg0) 1751 .addReg(TmpReg1); 1752 } 1753 1754 I.eraseFromParent(); 1755 return true; 1756 } 1757 1758 if (!DstTy.isScalar()) 1759 return false; 1760 1761 if (SrcSize > 32) { 1762 int SubRegIdx = sizeToSubRegIndex(DstSize); 1763 if (SubRegIdx == -1) 1764 return false; 1765 1766 // Deal with weird cases where the class only partially supports the subreg 1767 // index. 1768 const TargetRegisterClass *SrcWithSubRC 1769 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1770 if (!SrcWithSubRC) 1771 return false; 1772 1773 if (SrcWithSubRC != SrcRC) { 1774 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1775 return false; 1776 } 1777 1778 I.getOperand(1).setSubReg(SubRegIdx); 1779 } 1780 1781 I.setDesc(TII.get(TargetOpcode::COPY)); 1782 return true; 1783 } 1784 1785 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1786 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1787 Mask = maskTrailingOnes<unsigned>(Size); 1788 int SignedMask = static_cast<int>(Mask); 1789 return SignedMask >= -16 && SignedMask <= 64; 1790 } 1791 1792 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1793 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1794 Register Reg, const MachineRegisterInfo &MRI, 1795 const TargetRegisterInfo &TRI) const { 1796 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1797 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1798 return RB; 1799 1800 // Ignore the type, since we don't use vcc in artifacts. 
1801 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1802 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1803 return nullptr; 1804 } 1805 1806 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1807 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1808 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1809 const DebugLoc &DL = I.getDebugLoc(); 1810 MachineBasicBlock &MBB = *I.getParent(); 1811 const Register DstReg = I.getOperand(0).getReg(); 1812 const Register SrcReg = I.getOperand(1).getReg(); 1813 1814 const LLT DstTy = MRI->getType(DstReg); 1815 const LLT SrcTy = MRI->getType(SrcReg); 1816 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1817 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1818 const unsigned DstSize = DstTy.getSizeInBits(); 1819 if (!DstTy.isScalar()) 1820 return false; 1821 1822 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1823 return selectCOPY(I); 1824 1825 // Artifact casts should never use vcc. 1826 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1827 1828 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1829 // 64-bit should have been split up in RegBankSelect 1830 1831 // Try to use an and with a mask if it will save code size. 1832 unsigned Mask; 1833 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1834 MachineInstr *ExtI = 1835 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1836 .addImm(Mask) 1837 .addReg(SrcReg); 1838 I.eraseFromParent(); 1839 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1840 } 1841 1842 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1843 MachineInstr *ExtI = 1844 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1845 .addReg(SrcReg) 1846 .addImm(0) // Offset 1847 .addImm(SrcSize); // Width 1848 I.eraseFromParent(); 1849 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1850 } 1851 1852 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1853 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 1854 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 1855 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 1856 return false; 1857 1858 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1859 const unsigned SextOpc = SrcSize == 8 ? 1860 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1861 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1862 .addReg(SrcReg); 1863 I.eraseFromParent(); 1864 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1865 } 1866 1867 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1868 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1869 1870 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1871 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 1872 // We need a 64-bit register source, but the high bits don't matter. 1873 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1874 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1875 unsigned SubReg = InReg ? 
AMDGPU::sub0 : 0; 1876 1877 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1878 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1879 .addReg(SrcReg, 0, SubReg) 1880 .addImm(AMDGPU::sub0) 1881 .addReg(UndefReg) 1882 .addImm(AMDGPU::sub1); 1883 1884 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1885 .addReg(ExtReg) 1886 .addImm(SrcSize << 16); 1887 1888 I.eraseFromParent(); 1889 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1890 } 1891 1892 unsigned Mask; 1893 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1894 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1895 .addReg(SrcReg) 1896 .addImm(Mask); 1897 } else { 1898 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1899 .addReg(SrcReg) 1900 .addImm(SrcSize << 16); 1901 } 1902 1903 I.eraseFromParent(); 1904 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1905 } 1906 1907 return false; 1908 } 1909 1910 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1911 MachineBasicBlock *BB = I.getParent(); 1912 MachineOperand &ImmOp = I.getOperand(1); 1913 1914 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1915 if (ImmOp.isFPImm()) { 1916 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1917 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1918 } else if (ImmOp.isCImm()) { 1919 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1920 } 1921 1922 Register DstReg = I.getOperand(0).getReg(); 1923 unsigned Size; 1924 bool IsSgpr; 1925 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1926 if (RB) { 1927 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1928 Size = MRI->getType(DstReg).getSizeInBits(); 1929 } else { 1930 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1931 IsSgpr = TRI.isSGPRClass(RC); 1932 Size = TRI.getRegSizeInBits(*RC); 1933 } 1934 1935 if (Size != 32 && Size != 64) 1936 return false; 1937 1938 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1939 if (Size == 32) { 1940 I.setDesc(TII.get(Opcode)); 1941 I.addImplicitDefUseOperands(*MF); 1942 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1943 } 1944 1945 const DebugLoc &DL = I.getDebugLoc(); 1946 1947 APInt Imm(Size, I.getOperand(1).getImm()); 1948 1949 MachineInstr *ResInst; 1950 if (IsSgpr && TII.isInlineConstant(Imm)) { 1951 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1952 .addImm(I.getOperand(1).getImm()); 1953 } else { 1954 const TargetRegisterClass *RC = IsSgpr ? 
1955 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 1956 Register LoReg = MRI->createVirtualRegister(RC); 1957 Register HiReg = MRI->createVirtualRegister(RC); 1958 1959 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 1960 .addImm(Imm.trunc(32).getZExtValue()); 1961 1962 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 1963 .addImm(Imm.ashr(32).getZExtValue()); 1964 1965 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1966 .addReg(LoReg) 1967 .addImm(AMDGPU::sub0) 1968 .addReg(HiReg) 1969 .addImm(AMDGPU::sub1); 1970 } 1971 1972 // We can't call constrainSelectedInstRegOperands here, because it doesn't 1973 // work for target independent opcodes 1974 I.eraseFromParent(); 1975 const TargetRegisterClass *DstRC = 1976 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 1977 if (!DstRC) 1978 return true; 1979 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1980 } 1981 1982 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 1983 // Only manually handle the f64 SGPR case. 1984 // 1985 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 1986 // the bit ops theoretically have a second result due to the implicit def of 1987 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 1988 // that is easy by disabling the check. The result works, but uses a 1989 // nonsensical sreg32orlds_and_sreg_1 regclass. 1990 // 1991 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 1992 // the variadic REG_SEQUENCE operands. 1993 1994 Register Dst = MI.getOperand(0).getReg(); 1995 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 1996 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 1997 MRI->getType(Dst) != LLT::scalar(64)) 1998 return false; 1999 2000 Register Src = MI.getOperand(1).getReg(); 2001 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 2002 if (Fabs) 2003 Src = Fabs->getOperand(1).getReg(); 2004 2005 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2006 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2007 return false; 2008 2009 MachineBasicBlock *BB = MI.getParent(); 2010 const DebugLoc &DL = MI.getDebugLoc(); 2011 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2012 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2013 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2014 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2015 2016 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2017 .addReg(Src, 0, AMDGPU::sub0); 2018 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2019 .addReg(Src, 0, AMDGPU::sub1); 2020 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2021 .addImm(0x80000000); 2022 2023 // Set or toggle sign bit. 2024 unsigned Opc = Fabs ? 
    AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}

// FIXME: This is a workaround for the same tablegen problems as G_FNEG
bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  Register Src = MI.getOperand(1).getReg();
  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(Src, 0, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(Src, 0, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
    .addImm(0x7fffffff);

  // Clear sign bit.
  // TODO: Should this use S_BITSET0_*?
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);

  MI.eraseFromParent();
  return true;
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
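      // For example, (G_PTR_ADD %base, G_CONSTANT 16) records Imm = 16 below;
      // a constant appearing as the base operand is not folded here and is
      // pushed into the SGPR/VGPR parts like any other register.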
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}

bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
  }
}

bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}

// TODO: No rtn optimization.
bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
  MachineInstr &MI) const {
  Register PtrReg = MI.getOperand(1).getReg();
  const LLT PtrTy = MRI->getType(PtrReg);
  if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
      STI.useFlatForGlobal())
    return selectImpl(MI, *CoverageInfo);

  Register DstReg = MI.getOperand(0).getReg();
  const LLT Ty = MRI->getType(DstReg);
  const bool Is64 = Ty.getSizeInBits() == 64;
  const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  Register TmpReg = MRI->createVirtualRegister(
    Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  Register VAddr, RSrcReg, SOffset;
  int64_t Offset = 0;

  unsigned Opcode;
  if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
    Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
                    AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
  } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
                                   RSrcReg, SOffset, Offset)) {
    Opcode = Is64 ?
             AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
             AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
  } else
    return selectImpl(MI, *CoverageInfo);

  auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
    .addReg(MI.getOperand(2).getReg());

  if (VAddr)
    MIB.addReg(VAddr);

  MIB.addReg(RSrcReg);
  if (SOffset)
    MIB.addReg(SOffset);
  else
    MIB.addImm(0);

  MIB.addImm(Offset);
  MIB.addImm(0); // slc
  MIB.cloneMemRefs(MI);

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(TmpReg, RegState::Kill, SubReg);

  MI.eraseFromParent();

  MRI->setRegClass(
    DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
  // RegBankSelect knows what it's doing if the branch condition is scc, even
  // though it currently does not.
  if (!isVCC(CondReg, *MRI)) {
    if (MRI->getType(CondReg) != LLT::scalar(32))
      return false;

    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    // FIXME: Hack for isSCC tests
    ConstrainRC = &AMDGPU::SGPR_32RegClass;
  } else {
    // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // We sort of know, based on the register bank, that a VCC producer ands
    // inactive lanes with 0. What if there was a logical operation with vcc
    // producers in different blocks/with different exec masks?
    // FIXME: Should scc->vcc copies and with exec?
    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  }

  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
  MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
  if (IsVGPR)
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  return RBI.constrainGenericRegister(
    DstReg, IsVGPR ?
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 2281 } 2282 2283 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 2284 Register DstReg = I.getOperand(0).getReg(); 2285 Register SrcReg = I.getOperand(1).getReg(); 2286 Register MaskReg = I.getOperand(2).getReg(); 2287 LLT Ty = MRI->getType(DstReg); 2288 LLT MaskTy = MRI->getType(MaskReg); 2289 2290 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2291 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2292 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 2293 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2294 if (DstRB != SrcRB) // Should only happen for hand written MIR. 2295 return false; 2296 2297 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2298 const TargetRegisterClass &RegRC 2299 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2300 2301 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 2302 *MRI); 2303 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 2304 *MRI); 2305 const TargetRegisterClass *MaskRC = 2306 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); 2307 2308 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2309 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2310 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 2311 return false; 2312 2313 MachineBasicBlock *BB = I.getParent(); 2314 const DebugLoc &DL = I.getDebugLoc(); 2315 if (Ty.getSizeInBits() == 32) { 2316 assert(MaskTy.getSizeInBits() == 32 && 2317 "ptrmask should have been narrowed during legalize"); 2318 2319 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 2320 .addReg(SrcReg) 2321 .addReg(MaskReg); 2322 I.eraseFromParent(); 2323 return true; 2324 } 2325 2326 Register HiReg = MRI->createVirtualRegister(&RegRC); 2327 Register LoReg = MRI->createVirtualRegister(&RegRC); 2328 2329 // Extract the subregisters from the source pointer. 2330 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 2331 .addReg(SrcReg, 0, AMDGPU::sub0); 2332 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 2333 .addReg(SrcReg, 0, AMDGPU::sub1); 2334 2335 Register MaskedLo, MaskedHi; 2336 2337 // Try to avoid emitting a bit operation when we only need to touch half of 2338 // the 64-bit pointer. 2339 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); 2340 2341 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 2342 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 2343 if ((MaskOnes & MaskLo32) == MaskLo32) { 2344 // If all the bits in the low half are 1, we only need a copy for it. 2345 MaskedLo = LoReg; 2346 } else { 2347 // Extract the mask subregister and apply the and. 2348 Register MaskLo = MRI->createVirtualRegister(&RegRC); 2349 MaskedLo = MRI->createVirtualRegister(&RegRC); 2350 2351 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 2352 .addReg(MaskReg, 0, AMDGPU::sub0); 2353 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 2354 .addReg(LoReg) 2355 .addReg(MaskLo); 2356 } 2357 2358 if ((MaskOnes & MaskHi32) == MaskHi32) { 2359 // If all the bits in the high half are 1, we only need a copy for it. 
2360 MaskedHi = HiReg; 2361 } else { 2362 Register MaskHi = MRI->createVirtualRegister(&RegRC); 2363 MaskedHi = MRI->createVirtualRegister(&RegRC); 2364 2365 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 2366 .addReg(MaskReg, 0, AMDGPU::sub1); 2367 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 2368 .addReg(HiReg) 2369 .addReg(MaskHi); 2370 } 2371 2372 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2373 .addReg(MaskedLo) 2374 .addImm(AMDGPU::sub0) 2375 .addReg(MaskedHi) 2376 .addImm(AMDGPU::sub1); 2377 I.eraseFromParent(); 2378 return true; 2379 } 2380 2381 /// Return the register to use for the index value, and the subregister to use 2382 /// for the indirectly accessed register. 2383 static std::pair<Register, unsigned> 2384 computeIndirectRegIndex(MachineRegisterInfo &MRI, 2385 const SIRegisterInfo &TRI, 2386 const TargetRegisterClass *SuperRC, 2387 Register IdxReg, 2388 unsigned EltSize) { 2389 Register IdxBaseReg; 2390 int Offset; 2391 MachineInstr *Unused; 2392 2393 std::tie(IdxBaseReg, Offset, Unused) 2394 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 2395 if (IdxBaseReg == AMDGPU::NoRegister) { 2396 // This will happen if the index is a known constant. This should ordinarily 2397 // be legalized out, but handle it as a register just in case. 2398 assert(Offset == 0); 2399 IdxBaseReg = IdxReg; 2400 } 2401 2402 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 2403 2404 // Skip out of bounds offsets, or else we would end up using an undefined 2405 // register. 2406 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 2407 return std::make_pair(IdxReg, SubRegs[0]); 2408 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 2409 } 2410 2411 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 2412 MachineInstr &MI) const { 2413 Register DstReg = MI.getOperand(0).getReg(); 2414 Register SrcReg = MI.getOperand(1).getReg(); 2415 Register IdxReg = MI.getOperand(2).getReg(); 2416 2417 LLT DstTy = MRI->getType(DstReg); 2418 LLT SrcTy = MRI->getType(SrcReg); 2419 2420 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2421 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2422 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2423 2424 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2425 // into a waterfall loop. 2426 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2427 return false; 2428 2429 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 2430 *MRI); 2431 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 2432 *MRI); 2433 if (!SrcRC || !DstRC) 2434 return false; 2435 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2436 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2437 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2438 return false; 2439 2440 MachineBasicBlock *BB = MI.getParent(); 2441 const DebugLoc &DL = MI.getDebugLoc(); 2442 const bool Is64 = DstTy.getSizeInBits() == 64; 2443 2444 unsigned SubReg; 2445 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 2446 DstTy.getSizeInBits() / 8); 2447 2448 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 2449 if (DstTy.getSizeInBits() != 32 && !Is64) 2450 return false; 2451 2452 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2453 .addReg(IdxReg); 2454 2455 unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 2456 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 2457 .addReg(SrcReg, 0, SubReg) 2458 .addReg(SrcReg, RegState::Implicit); 2459 MI.eraseFromParent(); 2460 return true; 2461 } 2462 2463 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 2464 return false; 2465 2466 if (!STI.useVGPRIndexMode()) { 2467 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2468 .addReg(IdxReg); 2469 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 2470 .addReg(SrcReg, RegState::Undef, SubReg) 2471 .addReg(SrcReg, RegState::Implicit); 2472 MI.eraseFromParent(); 2473 return true; 2474 } 2475 2476 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2477 .addReg(IdxReg) 2478 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 2479 BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg) 2480 .addReg(SrcReg, RegState::Undef, SubReg) 2481 .addReg(SrcReg, RegState::Implicit) 2482 .addReg(AMDGPU::M0, RegState::Implicit); 2483 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2484 2485 MI.eraseFromParent(); 2486 return true; 2487 } 2488 2489 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 2490 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 2491 MachineInstr &MI) const { 2492 Register DstReg = MI.getOperand(0).getReg(); 2493 Register VecReg = MI.getOperand(1).getReg(); 2494 Register ValReg = MI.getOperand(2).getReg(); 2495 Register IdxReg = MI.getOperand(3).getReg(); 2496 2497 LLT VecTy = MRI->getType(DstReg); 2498 LLT ValTy = MRI->getType(ValReg); 2499 unsigned VecSize = VecTy.getSizeInBits(); 2500 unsigned ValSize = ValTy.getSizeInBits(); 2501 2502 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 2503 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 2504 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2505 2506 assert(VecTy.getElementType() == ValTy); 2507 2508 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2509 // into a waterfall loop. 
2510 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2511 return false; 2512 2513 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 2514 *MRI); 2515 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 2516 *MRI); 2517 2518 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 2519 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 2520 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 2521 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2522 return false; 2523 2524 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 2525 return false; 2526 2527 unsigned SubReg; 2528 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 2529 ValSize / 8); 2530 2531 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 2532 STI.useVGPRIndexMode(); 2533 2534 MachineBasicBlock *BB = MI.getParent(); 2535 const DebugLoc &DL = MI.getDebugLoc(); 2536 2537 if (IndexMode) { 2538 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2539 .addReg(IdxReg) 2540 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 2541 } else { 2542 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2543 .addReg(IdxReg); 2544 } 2545 2546 const MCInstrDesc &RegWriteOp 2547 = TII.getIndirectRegWritePseudo(VecSize, ValSize, 2548 VecRB->getID() == AMDGPU::SGPRRegBankID); 2549 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 2550 .addReg(VecReg) 2551 .addReg(ValReg) 2552 .addImm(SubReg); 2553 2554 if (IndexMode) 2555 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2556 2557 MI.eraseFromParent(); 2558 return true; 2559 } 2560 2561 static bool isZeroOrUndef(int X) { 2562 return X == 0 || X == -1; 2563 } 2564 2565 static bool isOneOrUndef(int X) { 2566 return X == 1 || X == -1; 2567 } 2568 2569 static bool isZeroOrOneOrUndef(int X) { 2570 return X == 0 || X == 1 || X == -1; 2571 } 2572 2573 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single 2574 // 32-bit register. 2575 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, 2576 ArrayRef<int> Mask) { 2577 NewMask[0] = Mask[0]; 2578 NewMask[1] = Mask[1]; 2579 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) 2580 return Src0; 2581 2582 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); 2583 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); 2584 2585 // Shift the mask inputs to be 0/1; 2586 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; 2587 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; 2588 return Src1; 2589 } 2590 2591 // This is only legal with VOP3P instructions as an aid to op_sel matching. 
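// (op_sel selects which 16-bit half of each source feeds the low half of the
// result, and op_sel_hi which half feeds the high half, so any legal
// two-element shuffle of a single 32-bit source can be expressed this way.)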
2592 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( 2593 MachineInstr &MI) const { 2594 Register DstReg = MI.getOperand(0).getReg(); 2595 Register Src0Reg = MI.getOperand(1).getReg(); 2596 Register Src1Reg = MI.getOperand(2).getReg(); 2597 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); 2598 2599 const LLT V2S16 = LLT::vector(2, 16); 2600 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) 2601 return false; 2602 2603 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) 2604 return false; 2605 2606 assert(ShufMask.size() == 2); 2607 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); 2608 2609 MachineBasicBlock *MBB = MI.getParent(); 2610 const DebugLoc &DL = MI.getDebugLoc(); 2611 2612 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2613 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2614 const TargetRegisterClass &RC = IsVALU ? 2615 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2616 2617 // Handle the degenerate case which should have folded out. 2618 if (ShufMask[0] == -1 && ShufMask[1] == -1) { 2619 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); 2620 2621 MI.eraseFromParent(); 2622 return RBI.constrainGenericRegister(DstReg, RC, *MRI); 2623 } 2624 2625 // A legal VOP3P mask only reads one of the sources. 2626 int Mask[2]; 2627 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); 2628 2629 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || 2630 !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) 2631 return false; 2632 2633 // TODO: This also should have been folded out 2634 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { 2635 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) 2636 .addReg(SrcVec); 2637 2638 MI.eraseFromParent(); 2639 return true; 2640 } 2641 2642 if (Mask[0] == 1 && Mask[1] == -1) { 2643 if (IsVALU) { 2644 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 2645 .addImm(16) 2646 .addReg(SrcVec); 2647 } else { 2648 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 2649 .addReg(SrcVec) 2650 .addImm(16); 2651 } 2652 } else if (Mask[0] == -1 && Mask[1] == 0) { 2653 if (IsVALU) { 2654 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) 2655 .addImm(16) 2656 .addReg(SrcVec); 2657 } else { 2658 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) 2659 .addReg(SrcVec) 2660 .addImm(16); 2661 } 2662 } else if (Mask[0] == 0 && Mask[1] == 0) { 2663 if (IsVALU) { 2664 // Write low half of the register into the high half. 2665 MachineInstr *MovSDWA = 2666 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2667 .addImm(0) // $src0_modifiers 2668 .addReg(SrcVec) // $src0 2669 .addImm(0) // $clamp 2670 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2671 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2672 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2673 .addReg(SrcVec, RegState::Implicit); 2674 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2675 } else { 2676 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2677 .addReg(SrcVec) 2678 .addReg(SrcVec); 2679 } 2680 } else if (Mask[0] == 1 && Mask[1] == 1) { 2681 if (IsVALU) { 2682 // Write high half of the register into the low half. 
2683 MachineInstr *MovSDWA = 2684 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2685 .addImm(0) // $src0_modifiers 2686 .addReg(SrcVec) // $src0 2687 .addImm(0) // $clamp 2688 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel 2689 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2690 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel 2691 .addReg(SrcVec, RegState::Implicit); 2692 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2693 } else { 2694 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) 2695 .addReg(SrcVec) 2696 .addReg(SrcVec); 2697 } 2698 } else if (Mask[0] == 1 && Mask[1] == 0) { 2699 if (IsVALU) { 2700 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg) 2701 .addReg(SrcVec) 2702 .addReg(SrcVec) 2703 .addImm(16); 2704 } else { 2705 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2706 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) 2707 .addReg(SrcVec) 2708 .addImm(16); 2709 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2710 .addReg(TmpReg) 2711 .addReg(SrcVec); 2712 } 2713 } else 2714 llvm_unreachable("all shuffle masks should be handled"); 2715 2716 MI.eraseFromParent(); 2717 return true; 2718 } 2719 2720 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 2721 if (I.isPHI()) 2722 return selectPHI(I); 2723 2724 if (!I.isPreISelOpcode()) { 2725 if (I.isCopy()) 2726 return selectCOPY(I); 2727 return true; 2728 } 2729 2730 switch (I.getOpcode()) { 2731 case TargetOpcode::G_AND: 2732 case TargetOpcode::G_OR: 2733 case TargetOpcode::G_XOR: 2734 if (selectImpl(I, *CoverageInfo)) 2735 return true; 2736 return selectG_AND_OR_XOR(I); 2737 case TargetOpcode::G_ADD: 2738 case TargetOpcode::G_SUB: 2739 if (selectImpl(I, *CoverageInfo)) 2740 return true; 2741 return selectG_ADD_SUB(I); 2742 case TargetOpcode::G_UADDO: 2743 case TargetOpcode::G_USUBO: 2744 case TargetOpcode::G_UADDE: 2745 case TargetOpcode::G_USUBE: 2746 return selectG_UADDO_USUBO_UADDE_USUBE(I); 2747 case TargetOpcode::G_INTTOPTR: 2748 case TargetOpcode::G_BITCAST: 2749 case TargetOpcode::G_PTRTOINT: 2750 return selectCOPY(I); 2751 case TargetOpcode::G_CONSTANT: 2752 case TargetOpcode::G_FCONSTANT: 2753 return selectG_CONSTANT(I); 2754 case TargetOpcode::G_FNEG: 2755 if (selectImpl(I, *CoverageInfo)) 2756 return true; 2757 return selectG_FNEG(I); 2758 case TargetOpcode::G_FABS: 2759 if (selectImpl(I, *CoverageInfo)) 2760 return true; 2761 return selectG_FABS(I); 2762 case TargetOpcode::G_EXTRACT: 2763 return selectG_EXTRACT(I); 2764 case TargetOpcode::G_MERGE_VALUES: 2765 case TargetOpcode::G_BUILD_VECTOR: 2766 case TargetOpcode::G_CONCAT_VECTORS: 2767 return selectG_MERGE_VALUES(I); 2768 case TargetOpcode::G_UNMERGE_VALUES: 2769 return selectG_UNMERGE_VALUES(I); 2770 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2771 return selectG_BUILD_VECTOR_TRUNC(I); 2772 case TargetOpcode::G_PTR_ADD: 2773 return selectG_PTR_ADD(I); 2774 case TargetOpcode::G_IMPLICIT_DEF: 2775 return selectG_IMPLICIT_DEF(I); 2776 case TargetOpcode::G_INSERT: 2777 return selectG_INSERT(I); 2778 case TargetOpcode::G_INTRINSIC: 2779 return selectG_INTRINSIC(I); 2780 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 2781 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 2782 case TargetOpcode::G_ICMP: 2783 if (selectG_ICMP(I)) 2784 return true; 2785 return selectImpl(I, *CoverageInfo); 2786 case TargetOpcode::G_LOAD: 2787 case TargetOpcode::G_ATOMIC_CMPXCHG: 2788 case TargetOpcode::G_ATOMICRMW_XCHG: 2789 case TargetOpcode::G_ATOMICRMW_ADD: 2790 case 
TargetOpcode::G_ATOMICRMW_SUB: 2791 case TargetOpcode::G_ATOMICRMW_AND: 2792 case TargetOpcode::G_ATOMICRMW_OR: 2793 case TargetOpcode::G_ATOMICRMW_XOR: 2794 case TargetOpcode::G_ATOMICRMW_MIN: 2795 case TargetOpcode::G_ATOMICRMW_MAX: 2796 case TargetOpcode::G_ATOMICRMW_UMIN: 2797 case TargetOpcode::G_ATOMICRMW_UMAX: 2798 case TargetOpcode::G_ATOMICRMW_FADD: 2799 return selectG_LOAD_ATOMICRMW(I); 2800 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 2801 return selectG_AMDGPU_ATOMIC_CMPXCHG(I); 2802 case TargetOpcode::G_SELECT: 2803 return selectG_SELECT(I); 2804 case TargetOpcode::G_STORE: 2805 return selectG_STORE(I); 2806 case TargetOpcode::G_TRUNC: 2807 return selectG_TRUNC(I); 2808 case TargetOpcode::G_SEXT: 2809 case TargetOpcode::G_ZEXT: 2810 case TargetOpcode::G_ANYEXT: 2811 case TargetOpcode::G_SEXT_INREG: 2812 if (selectImpl(I, *CoverageInfo)) 2813 return true; 2814 return selectG_SZA_EXT(I); 2815 case TargetOpcode::G_BRCOND: 2816 return selectG_BRCOND(I); 2817 case TargetOpcode::G_FRAME_INDEX: 2818 case TargetOpcode::G_GLOBAL_VALUE: 2819 return selectG_FRAME_INDEX_GLOBAL_VALUE(I); 2820 case TargetOpcode::G_PTRMASK: 2821 return selectG_PTRMASK(I); 2822 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2823 return selectG_EXTRACT_VECTOR_ELT(I); 2824 case TargetOpcode::G_INSERT_VECTOR_ELT: 2825 return selectG_INSERT_VECTOR_ELT(I); 2826 case TargetOpcode::G_SHUFFLE_VECTOR: 2827 return selectG_SHUFFLE_VECTOR(I); 2828 case AMDGPU::G_AMDGPU_ATOMIC_INC: 2829 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 2830 initM0(I); 2831 return selectImpl(I, *CoverageInfo); 2832 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 2833 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 2834 const AMDGPU::ImageDimIntrinsicInfo *Intr 2835 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); 2836 assert(Intr && "not an image intrinsic with image pseudo"); 2837 return selectImageIntrinsic(I, Intr); 2838 } 2839 default: 2840 return selectImpl(I, *CoverageInfo); 2841 } 2842 return false; 2843 } 2844 2845 InstructionSelector::ComplexRendererFns 2846 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 2847 return {{ 2848 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2849 }}; 2850 2851 } 2852 2853 std::pair<Register, unsigned> 2854 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const { 2855 Register Src = Root.getReg(); 2856 Register OrigSrc = Src; 2857 unsigned Mods = 0; 2858 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 2859 2860 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 2861 Src = MI->getOperand(1).getReg(); 2862 Mods |= SISrcMods::NEG; 2863 MI = getDefIgnoringCopies(Src, *MRI); 2864 } 2865 2866 if (MI && MI->getOpcode() == AMDGPU::G_FABS) { 2867 Src = MI->getOperand(1).getReg(); 2868 Mods |= SISrcMods::ABS; 2869 } 2870 2871 if (Mods != 0 && 2872 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 2873 MachineInstr *UseMI = Root.getParent(); 2874 2875 // If we looked through copies to find source modifiers on an SGPR operand, 2876 // we now have an SGPR register source. To avoid potentially violating the 2877 // constant bus restriction, we need to insert a copy to a VGPR. 2878 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); 2879 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 2880 TII.get(AMDGPU::COPY), VGPRSrc) 2881 .addReg(Src); 2882 Src = VGPRSrc; 2883 } 2884 2885 return std::make_pair(Src, Mods); 2886 } 2887 2888 /// 2889 /// This will select either an SGPR or VGPR operand and will save us from 2890 /// having to write an extra tablegen pattern. 
2891 InstructionSelector::ComplexRendererFns 2892 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 2893 return {{ 2894 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2895 }}; 2896 } 2897 2898 InstructionSelector::ComplexRendererFns 2899 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 2900 Register Src; 2901 unsigned Mods; 2902 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2903 2904 return {{ 2905 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2906 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 2907 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2908 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2909 }}; 2910 } 2911 2912 InstructionSelector::ComplexRendererFns 2913 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 2914 return {{ 2915 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2916 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2917 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2918 }}; 2919 } 2920 2921 InstructionSelector::ComplexRendererFns 2922 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 2923 Register Src; 2924 unsigned Mods; 2925 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2926 2927 return {{ 2928 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2929 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2930 }}; 2931 } 2932 2933 InstructionSelector::ComplexRendererFns 2934 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 2935 Register Reg = Root.getReg(); 2936 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 2937 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 2938 Def->getOpcode() == AMDGPU::G_FABS)) 2939 return {}; 2940 return {{ 2941 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2942 }}; 2943 } 2944 2945 std::pair<Register, unsigned> 2946 AMDGPUInstructionSelector::selectVOP3PModsImpl( 2947 Register Src, const MachineRegisterInfo &MRI) const { 2948 unsigned Mods = 0; 2949 MachineInstr *MI = MRI.getVRegDef(Src); 2950 2951 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 2952 // It's possible to see an f32 fneg here, but unlikely. 2953 // TODO: Treat f32 fneg as only high bit. 2954 MRI.getType(Src) == LLT::vector(2, 16)) { 2955 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 2956 Src = MI->getOperand(1).getReg(); 2957 MI = MRI.getVRegDef(Src); 2958 } 2959 2960 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 2961 2962 // Packed instructions do not have abs modifiers. 
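// Note: OP_SEL_1 is the op_sel_hi bit; setting it routes the high half of each
// source to the high half of the result, which is the identity (default)
// selection for packed operands.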
2963 Mods |= SISrcMods::OP_SEL_1; 2964 2965 return std::make_pair(Src, Mods); 2966 } 2967 2968 InstructionSelector::ComplexRendererFns 2969 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 2970 MachineRegisterInfo &MRI 2971 = Root.getParent()->getParent()->getParent()->getRegInfo(); 2972 2973 Register Src; 2974 unsigned Mods; 2975 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 2976 2977 return {{ 2978 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2979 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2980 }}; 2981 } 2982 2983 InstructionSelector::ComplexRendererFns 2984 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 2985 Register Src; 2986 unsigned Mods; 2987 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2988 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) 2989 return None; 2990 2991 return {{ 2992 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2993 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2994 }}; 2995 } 2996 2997 InstructionSelector::ComplexRendererFns 2998 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 2999 // FIXME: Handle op_sel 3000 return {{ 3001 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3002 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 3003 }}; 3004 } 3005 3006 InstructionSelector::ComplexRendererFns 3007 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 3008 SmallVector<GEPInfo, 4> AddrInfo; 3009 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3010 3011 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3012 return None; 3013 3014 const GEPInfo &GEPInfo = AddrInfo[0]; 3015 Optional<int64_t> EncodedImm = 3016 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false); 3017 if (!EncodedImm) 3018 return None; 3019 3020 unsigned PtrReg = GEPInfo.SgprParts[0]; 3021 return {{ 3022 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3023 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3024 }}; 3025 } 3026 3027 InstructionSelector::ComplexRendererFns 3028 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 3029 SmallVector<GEPInfo, 4> AddrInfo; 3030 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3031 3032 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3033 return None; 3034 3035 const GEPInfo &GEPInfo = AddrInfo[0]; 3036 Register PtrReg = GEPInfo.SgprParts[0]; 3037 Optional<int64_t> EncodedImm = 3038 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 3039 if (!EncodedImm) 3040 return None; 3041 3042 return {{ 3043 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3044 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3045 }}; 3046 } 3047 3048 InstructionSelector::ComplexRendererFns 3049 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 3050 MachineInstr *MI = Root.getParent(); 3051 MachineBasicBlock *MBB = MI->getParent(); 3052 3053 SmallVector<GEPInfo, 4> AddrInfo; 3054 getAddrModeInfo(*MI, *MRI, AddrInfo); 3055 3056 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, 3057 // then we can select all ptr + 32-bit offsets not just immediate offsets. 3058 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3059 return None; 3060 3061 const GEPInfo &GEPInfo = AddrInfo[0]; 3062 // SGPR offset is unsigned. 
  if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM patterns are considered before the _SGPR patterns.
  Register PtrReg = GEPInfo.SgprParts[0];
  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}

template <bool Signed>
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();

  InstructionSelector::ComplexRendererFns Default = {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
  }};

  if (!STI.hasFlatInstOffsets())
    return Default;

  const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
  if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
    return Default;

  Optional<int64_t> Offset =
    getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
  if (!Offset.hasValue())
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
    return Default;

  Register BasePtr = OpDef->getOperand(1).getReg();

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  return selectFlatOffsetImpl<false>(Root);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
  return selectFlatOffsetImpl<true>(Root);
}

static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
      Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
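    // Illustration: a constant address like 0x12345 is split into a
    // 4096-byte-aligned base (0x12000, materialized into HighBits below) and
    // the remaining 12-bit immediate (0x345) placed in the MUBUF offset field.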
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
      .addImm(Offset & ~4095);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               const MachineMemOperand *MMO = *MI->memoperands_begin();
               const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

               if (isStackPtrRelative(PtrInfo))
                 MIB.addReg(Info->getStackPtrOffsetReg());
               else
                 MIB.addImm(0);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0 || Offset == -1);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    if (isBaseWithConstantOffset(Root, *MRI)) {
      const MachineOperand &LHS = RootDef->getOperand(1);
      const MachineOperand &RHS = RootDef->getOperand(2);
      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
      if (LHSDef && RHSDef) {
        int64_t PossibleOffset =
          RHSDef->getOperand(1).getCImm()->getSExtValue();
        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
            (!STI.privateMemoryResourceIsRangeChecked() ||
             KnownBits->signBitIsZero(LHS.getReg()))) {
          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
            FI = LHSDef->getOperand(1).getIndex();
          else
            VAddr = LHS.getReg();
          Offset = PossibleOffset;
        }
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             // If we don't know this private access is a local stack object, it
             // needs to be relative to the entry point's scratch wave offset.
             // TODO: Should split large offsets that don't fit like above.
             // TODO: Don't use scratch wave offset just because the offset
             // didn't fit.
             if (!Info->isEntryFunction() && FI.hasValue())
               MIB.addReg(Info->getStackPtrOffsetReg());
             else
               MIB.addImm(0);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}

bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
                                                int64_t Offset,
                                                unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
3236 return KnownBits->signBitIsZero(Base); 3237 } 3238 3239 InstructionSelector::ComplexRendererFns 3240 AMDGPUInstructionSelector::selectMUBUFScratchOffset( 3241 MachineOperand &Root) const { 3242 MachineInstr *MI = Root.getParent(); 3243 MachineBasicBlock *MBB = MI->getParent(); 3244 3245 int64_t Offset = 0; 3246 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || 3247 !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) 3248 return {}; 3249 3250 const MachineFunction *MF = MBB->getParent(); 3251 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 3252 const MachineMemOperand *MMO = *MI->memoperands_begin(); 3253 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 3254 3255 return {{ 3256 [=](MachineInstrBuilder &MIB) { // rsrc 3257 MIB.addReg(Info->getScratchRSrcReg()); 3258 }, 3259 [=](MachineInstrBuilder &MIB) { // soffset 3260 if (isStackPtrRelative(PtrInfo)) 3261 MIB.addReg(Info->getStackPtrOffsetReg()); 3262 else 3263 MIB.addImm(0); 3264 }, 3265 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 3266 }}; 3267 } 3268 3269 std::pair<Register, unsigned> 3270 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 3271 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 3272 if (!RootDef) 3273 return std::make_pair(Root.getReg(), 0); 3274 3275 int64_t ConstAddr = 0; 3276 3277 Register PtrBase; 3278 int64_t Offset; 3279 std::tie(PtrBase, Offset) = 3280 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 3281 3282 if (Offset) { 3283 if (isDSOffsetLegal(PtrBase, Offset, 16)) { 3284 // (add n0, c0) 3285 return std::make_pair(PtrBase, Offset); 3286 } 3287 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 3288 // TODO 3289 3290 3291 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 3292 // TODO 3293 3294 } 3295 3296 return std::make_pair(Root.getReg(), 0); 3297 } 3298 3299 InstructionSelector::ComplexRendererFns 3300 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { 3301 Register Reg; 3302 unsigned Offset; 3303 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); 3304 return {{ 3305 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3306 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } 3307 }}; 3308 } 3309 3310 InstructionSelector::ComplexRendererFns 3311 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { 3312 Register Reg; 3313 unsigned Offset; 3314 std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root); 3315 return {{ 3316 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3317 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, 3318 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); } 3319 }}; 3320 } 3321 3322 std::pair<Register, unsigned> 3323 AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const { 3324 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 3325 if (!RootDef) 3326 return std::make_pair(Root.getReg(), 0); 3327 3328 int64_t ConstAddr = 0; 3329 3330 Register PtrBase; 3331 int64_t Offset; 3332 std::tie(PtrBase, Offset) = 3333 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 3334 3335 if (Offset) { 3336 int64_t DWordOffset0 = Offset / 4; 3337 int64_t DWordOffset1 = DWordOffset0 + 1; 3338 if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) { 3339 // (add n0, c0) 3340 return std::make_pair(PtrBase, DWordOffset0); 3341 } 3342 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 3343 // TODO 3344 3345 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 3346 // TODO 3347 
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    int64_t DWordOffset0 = Offset / 4;
    int64_t DWordOffset1 = DWordOffset0 + 1;
    if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, DWordOffset0);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this does
/// not match the pattern.
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
  Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = MRI.getVRegDef(Root);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0};

  MachineOperand &RHS = RootI->getOperand(2);
  Optional<ValueAndVReg> MaybeOffset
    = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
  if (!MaybeOffset)
    return {Root, 0};
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
}

static void addZeroImm(MachineInstrBuilder &MIB) {
  MIB.addImm(0);
}

/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
/// BasePtr is not valid, a null base pointer will be used.
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}

static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}
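// For reference, a rough sketch of the MIR buildRSRC emits when no base
// pointer is supplied (register numbers and classes are invented for
// illustration; the real output depends on the builder's insertion point):
//   %f0:sreg_32 = S_MOV_B32 <FormatLo>
//   %f1:sreg_32 = S_MOV_B32 <FormatHi>
//   %hi:sreg_64 = REG_SEQUENCE %f0, %subreg.sub0, %f1, %subreg.sub1
//   %lo:sreg_64 = S_MOV_B64 0
//   %rsrc:sgpr_128 = REG_SEQUENCE %lo, %subreg.sub0_sub1,
//                                 %hi, %subreg.sub2_sub3
// The addr64 variant above passes 0 for the low format word, while the
// offset variant below passes all-ones; see the FIXME about that asymmetry.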
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted.
    // FIXME: This assumes the value is defined by operand 0 of the def.
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return whether the addr64 MUBUF mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(SOffset)
    .addImm(ImmOffset);
  ImmOffset = 0;
}
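// Worked example (a sketch; the exact legal range is whatever
// SIInstrInfo::isLegalMUBUFImmOffset accepts on the current subtarget): an
// incoming ImmOffset of 8 is legal and left untouched, whereas an ImmOffset
// of 0x10000 would not fit the immediate field, so it is moved into a fresh
// SGPR via S_MOV_B32 and the immediate reset to 0, giving soffset + 0 as the
// final offset component.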
bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
  Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // addr64 bit was removed for volcanic islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource.
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
  int64_t &Offset) const {
  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Register SRDPtr = AddrData.N0;
  Offset = AddrData.Offset;

  // TODO: Look through extensions for 32-bit soffset.
  MachineIRBuilder B(*Root.getParent());

  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
    }};
}
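// Descriptive note (informal; the authoritative operand lists come from the
// TableGen patterns, not this comment): the renderers above follow the MUBUF
// operand order rsrc, [vaddr,] soffset, offset, then the modifier immediates
// glc, slc, tfe, dlc and swz, all rendered as zero here. The atomic variants
// below render only slc after the offset, matching the shorter operand list
// of the atomic patterns.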
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm // slc
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm // slc
    }};
}

/// Get an immediate that must be 32 bits, and treated as zero extended.
static Optional<uint64_t> getConstantZext32Val(Register Reg,
                                               const MachineRegisterInfo &MRI) {
  // getConstantVRegVal sexts any values, so see if that matters.
  Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return None;
  return Lo_32(*OffsetVal);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm =
    AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm
    = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}
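// Worked examples for the custom renderers (informal, for illustration only;
// register names are invented): given %c:_(s32) = G_CONSTANT i32 5,
// renderNegateImm above adds the immediate -5; given
// %f:_(s32) = G_FCONSTANT float 1.0, renderBitcastImm below adds the raw bit
// pattern 0x3f800000.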
void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx == -1);

  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
    MIB.addImm(Op.getCImm()->getSExtValue());
  }
}

void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
}

/// This only really exists to satisfy DAG type checking machinery, so is a
/// no-op here.
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm());
}

void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
}

void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
}

void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
}

void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}

bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}
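// Informal note on the inline-immediate predicates above (an illustrative
// summary; the authoritative checks live in AMDGPUBaseInfo/SIInstrInfo):
// small integers such as 0, 1 or -16, and a handful of floating-point values
// like 0.5, 1.0, 2.0 and 4.0 (plus 1/(2*pi) when hasInv2PiInlineImm() is
// true), can be encoded directly in the instruction, so they need neither a
// literal constant slot nor a register.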