//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
                                        CodeGenCoverage &CoverageInfo) {
  MRI = &MF.getRegInfo();
  InstructionSelector::setupMF(MF, KB, CoverageInfo);
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (Register::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ?
                  AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ?
               AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ?
                                           CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
    .addReg(AMDGPU::SCC);

  if (!MRI->getRegClassOrNull(Dst1Reg))
    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  const unsigned SrcFlags = getUndefRegState(Src.isUndef());

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, SrcFlags, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
  MachineInstr &MI) const {
  if (selectImpl(MI, *CoverageInfo))
    return true;

  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != V2S16)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI->getType(Src0) != S32)
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;

      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
        .addImm(Lo16 | (Hi16 << 16));
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef) -> (copy $src0)
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.RemoveOperand(2);
    return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
           RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
  }

  Register ShiftSrc0;
  Register ShiftSrc1;
  int64_t ShiftAmt;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  // FIXME: This is an inconvenient way to check a specific value
  bool Shift0 = mi_match(
    Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  bool Shift1 = mi_match(
    Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
    ShiftAmt == 16;

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
    // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
    auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
      .addReg(ShiftSrc0)
      .addImm(16);

    MI.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
  const TargetRegisterClass *Src1RC =
    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64;
  else
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addUse(Src0)
    .addUse(Denom)
    .addUse(Numer);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::WWM);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
    return selectIntrinsicIcmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ?
        AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
            I.getOperand(0).getReg())
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
                               *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;

  if (Size != STI.getWavefrontSize())
    return false;

  Optional<ValueAndVReg> Arg =
      getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);

  if (Arg.hasValue()) {
    const int64_t Value = Arg.getValue().Value;
    if (Value == 0) {
      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else if (Value == -1) { // all ones
      Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
    } else
      return false;
  } else {
    Register SrcReg = I.getOperand(2).getReg();
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
  }

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto RelocSymbol = cast<GlobalVariable>(
    M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  MachineBasicBlock *BB = I.getParent();
  BuildMI(*BB, &I, I.getDebugLoc(),
          TII.get(IsVALU ?
                          AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
    .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
      !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions
  if (Depth != 0 ||
      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
      .addImm(0);
    I.eraseFromParent();
    return true;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  // Get the return address reg and mark it as an implicit live-in
  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                             AMDGPU::SReg_64RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(LiveIn);
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
  MachineBasicBlock *BB = MI.getParent();
  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
    .add(MI.getOperand(1));

  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
  return true;
}

bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
  MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease)
    report_fatal_error("ds_ordered_count: wave_done requires wave_release");

  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      report_fatal_error(
        "ds_ordered_count: dword count must be between 1 and 4");
    }
  }

  if (IndexOperand)
    report_fatal_error("ds_ordered_count: bad index operand");

  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ?
                           0 : 1;
  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);

  // Pack the ordered count index into offset0, and the control bits
  // (wave_release, wave_done, shader type, add vs. swap) into offset1.
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                     (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !STI.hasGWSSemaReleaseAll())
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  assert(OffsetDef);

  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.

    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset, OffsetDef)
      = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16);

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);
    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .addImm(-1) // $gds
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
                 .addImm(Offset)
                 .addImm(IsGDS ? -1 : 0)
                 .cloneMemRefs(MI);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x2;

  return TexFailCtrl == 0;
}

static bool parseCachePolicy(uint64_t Value,
                             bool *GLC, bool *SLC, bool *DLC) {
  if (GLC) {
    *GLC = (Value & 0x1) ? 1 : 0;
    Value &= ~(uint64_t)0x1;
  }
  if (SLC) {
    *SLC = (Value & 0x2) ? 1 : 0;
    Value &= ~(uint64_t)0x2;
  }
  if (DLC) {
    *DLC = (Value & 0x4) ?
                           1 : 0;
    Value &= ~(uint64_t)0x4;
  }

  return Value == 0;
}

bool AMDGPUInstructionSelector::selectImageIntrinsic(
  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
    AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
    AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;

  const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
                                             MI.getNumExplicitDefs());
  int NumVAddr, NumGradients;
  std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = false;

  // XXX - Can we just get the second to last argument for ctrl?
  unsigned CtrlIdx; // Index of texfailctrl argument
  bool Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = true;
    CtrlIdx = VAddrIdx + NumVAddr + 1;
  } else {
    Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
    CtrlIdx = VAddrIdx + NumVAddr + 3;
  }

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients
  if (IsA16 && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    const int DMaskIdx = 2; // Input/output + intrinsic ID.

    DMask = MI.getOperand(DMaskIdx).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      // One memoperand is mandatory, except for getresinfo.
      // FIXME: Check this in verifier.
      if (!MI.memoperands_empty()) {
        const MachineMemOperand *MMO = *MI.memoperands_begin();

        // Infer d16 from the memory size, as the register type will be mangled by
        // unpacked subtargets, or by TFE.
        IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;

        if (IsD16 && !STI.hasUnpackedD16VMem())
          NumVDataDwords = (DMaskLanes + 1) / 2;
      }
    }
  }

  // Optimize _L to _LZ when _L is zero
  if (LZMappingInfo) {
    // The legalizer replaced the register with an immediate 0 if we need to
    // change the opcode.
    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (MIPMappingInfo) {
    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
    }
  }

  // Set G16 opcode
  if (IsG16 && !IsA16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
      AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  bool GLC = false;
  bool SLC = false;
  bool DLC = false;
  if (BaseOpcode->Atomic) {
    GLC = true; // TODO no-return optimization
    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return false;
  } else {
    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
                          IsGFX10 ? &DLC : nullptr))
      return false;
  }

  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (int I = 0; I < NumVAddr; ++I) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register.
  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  if (IsTexFail)
    ++NumVDataDwords;

  int Opcode = -1;
  if (IsGFX10) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ?
                               AMDGPU::sub0_sub1 : AMDGPU::sub0;

      MIB.addDef(TmpReg);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
        .addReg(TmpReg, RegState::Kill, SubReg);

    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  for (int i = 0; i != NumVAddrRegs; ++i) {
    MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler

  MIB.addImm(DMask); // dmask

  if (IsGFX10)
    MIB.addImm(DimInfo->Encoding);
  MIB.addImm(Unorm);
  if (IsGFX10)
    MIB.addImm(DLC);

  MIB.addImm(GLC);
  MIB.addImm(SLC);
  MIB.addImm(IsA16 &&  // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10)
    MIB.addImm(IsA16 ? -1 : 0);

  MIB.addImm(TFE); // tfe
  MIB.addImm(LWE); // lwe
  if (!IsGFX10)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  default: {
    return selectImpl(I, *CoverageInfo);
  }
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it, so we need to manually set the register class here.
1670 if (!MRI->getRegClassOrNull(CCReg)) 1671 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1672 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1673 .add(I.getOperand(2)) 1674 .add(I.getOperand(3)); 1675 1676 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1677 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1678 I.eraseFromParent(); 1679 return Ret; 1680 } 1681 1682 // Wide VGPR select should have been split in RegBankSelect. 1683 if (Size > 32) 1684 return false; 1685 1686 MachineInstr *Select = 1687 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1688 .addImm(0) 1689 .add(I.getOperand(3)) 1690 .addImm(0) 1691 .add(I.getOperand(2)) 1692 .add(I.getOperand(1)); 1693 1694 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1695 I.eraseFromParent(); 1696 return Ret; 1697 } 1698 1699 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1700 initM0(I); 1701 return selectImpl(I, *CoverageInfo); 1702 } 1703 1704 static int sizeToSubRegIndex(unsigned Size) { 1705 switch (Size) { 1706 case 32: 1707 return AMDGPU::sub0; 1708 case 64: 1709 return AMDGPU::sub0_sub1; 1710 case 96: 1711 return AMDGPU::sub0_sub1_sub2; 1712 case 128: 1713 return AMDGPU::sub0_sub1_sub2_sub3; 1714 case 256: 1715 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1716 default: 1717 if (Size < 32) 1718 return AMDGPU::sub0; 1719 if (Size > 256) 1720 return -1; 1721 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1722 } 1723 } 1724 1725 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1726 Register DstReg = I.getOperand(0).getReg(); 1727 Register SrcReg = I.getOperand(1).getReg(); 1728 const LLT DstTy = MRI->getType(DstReg); 1729 const LLT SrcTy = MRI->getType(SrcReg); 1730 const LLT S1 = LLT::scalar(1); 1731 1732 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1733 const RegisterBank *DstRB; 1734 if (DstTy == S1) { 1735 // This is a special case. We don't treat s1 for legalization artifacts as 1736 // vcc booleans. 1737 DstRB = SrcRB; 1738 } else { 1739 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1740 if (SrcRB != DstRB) 1741 return false; 1742 } 1743 1744 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 1745 1746 unsigned DstSize = DstTy.getSizeInBits(); 1747 unsigned SrcSize = SrcTy.getSizeInBits(); 1748 1749 const TargetRegisterClass *SrcRC 1750 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1751 const TargetRegisterClass *DstRC 1752 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1753 if (!SrcRC || !DstRC) 1754 return false; 1755 1756 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1757 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1758 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1759 return false; 1760 } 1761 1762 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { 1763 MachineBasicBlock *MBB = I.getParent(); 1764 const DebugLoc &DL = I.getDebugLoc(); 1765 1766 Register LoReg = MRI->createVirtualRegister(DstRC); 1767 Register HiReg = MRI->createVirtualRegister(DstRC); 1768 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 1769 .addReg(SrcReg, 0, AMDGPU::sub0); 1770 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 1771 .addReg(SrcReg, 0, AMDGPU::sub1); 1772 1773 if (IsVALU && STI.hasSDWA()) { 1774 // Write the low 16-bits of the high element into the high 16-bits of the 1775 // low element. 
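      // Note (added comment): the SDWA mov below selects WORD_0 of $src0 and
      // writes it to WORD_1 of the destination with UNUSED_PRESERVE, so only
      // the high 16 bits of DstReg are rewritten; the implicit LoReg use tied
      // to the def is what models the preserved low half.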
1776 MachineInstr *MovSDWA = 1777 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1778 .addImm(0) // $src0_modifiers 1779 .addReg(HiReg) // $src0 1780 .addImm(0) // $clamp 1781 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1782 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1783 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1784 .addReg(LoReg, RegState::Implicit); 1785 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1786 } else { 1787 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1788 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1789 Register ImmReg = MRI->createVirtualRegister(DstRC); 1790 if (IsVALU) { 1791 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1792 .addImm(16) 1793 .addReg(HiReg); 1794 } else { 1795 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1796 .addReg(HiReg) 1797 .addImm(16); 1798 } 1799 1800 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1801 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1802 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1803 1804 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1805 .addImm(0xffff); 1806 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1807 .addReg(LoReg) 1808 .addReg(ImmReg); 1809 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1810 .addReg(TmpReg0) 1811 .addReg(TmpReg1); 1812 } 1813 1814 I.eraseFromParent(); 1815 return true; 1816 } 1817 1818 if (!DstTy.isScalar()) 1819 return false; 1820 1821 if (SrcSize > 32) { 1822 int SubRegIdx = sizeToSubRegIndex(DstSize); 1823 if (SubRegIdx == -1) 1824 return false; 1825 1826 // Deal with weird cases where the class only partially supports the subreg 1827 // index. 1828 const TargetRegisterClass *SrcWithSubRC 1829 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1830 if (!SrcWithSubRC) 1831 return false; 1832 1833 if (SrcWithSubRC != SrcRC) { 1834 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1835 return false; 1836 } 1837 1838 I.getOperand(1).setSubReg(SubRegIdx); 1839 } 1840 1841 I.setDesc(TII.get(TargetOpcode::COPY)); 1842 return true; 1843 } 1844 1845 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1846 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1847 Mask = maskTrailingOnes<unsigned>(Size); 1848 int SignedMask = static_cast<int>(Mask); 1849 return SignedMask >= -16 && SignedMask <= 64; 1850 } 1851 1852 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1853 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1854 Register Reg, const MachineRegisterInfo &MRI, 1855 const TargetRegisterInfo &TRI) const { 1856 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1857 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1858 return RB; 1859 1860 // Ignore the type, since we don't use vcc in artifacts. 
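  // (Added note: passing an empty LLT is intended to keep the bank query from
  // classifying a boolean register class as VCC, consistent with the comment
  // above.)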
1861 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1862 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1863 return nullptr; 1864 } 1865 1866 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1867 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1868 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1869 const DebugLoc &DL = I.getDebugLoc(); 1870 MachineBasicBlock &MBB = *I.getParent(); 1871 const Register DstReg = I.getOperand(0).getReg(); 1872 const Register SrcReg = I.getOperand(1).getReg(); 1873 1874 const LLT DstTy = MRI->getType(DstReg); 1875 const LLT SrcTy = MRI->getType(SrcReg); 1876 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1877 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1878 const unsigned DstSize = DstTy.getSizeInBits(); 1879 if (!DstTy.isScalar()) 1880 return false; 1881 1882 // Artifact casts should never use vcc. 1883 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1884 1885 // FIXME: This should probably be illegal and split earlier. 1886 if (I.getOpcode() == AMDGPU::G_ANYEXT) { 1887 if (DstSize <= 32) 1888 return selectCOPY(I); 1889 1890 const TargetRegisterClass *SrcRC = 1891 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI); 1892 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 1893 const TargetRegisterClass *DstRC = 1894 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 1895 1896 Register UndefReg = MRI->createVirtualRegister(SrcRC); 1897 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1898 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1899 .addReg(SrcReg) 1900 .addImm(AMDGPU::sub0) 1901 .addReg(UndefReg) 1902 .addImm(AMDGPU::sub1); 1903 I.eraseFromParent(); 1904 1905 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) && 1906 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI); 1907 } 1908 1909 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1910 // 64-bit should have been split up in RegBankSelect 1911 1912 // Try to use an and with a mask if it will save code size. 1913 unsigned Mask; 1914 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1915 MachineInstr *ExtI = 1916 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1917 .addImm(Mask) 1918 .addReg(SrcReg); 1919 I.eraseFromParent(); 1920 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1921 } 1922 1923 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1924 MachineInstr *ExtI = 1925 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1926 .addReg(SrcReg) 1927 .addImm(0) // Offset 1928 .addImm(SrcSize); // Width 1929 I.eraseFromParent(); 1930 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1931 } 1932 1933 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1934 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 1935 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 1936 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 1937 return false; 1938 1939 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1940 const unsigned SextOpc = SrcSize == 8 ? 1941 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1942 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1943 .addReg(SrcReg); 1944 I.eraseFromParent(); 1945 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1946 } 1947 1948 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1949 const unsigned BFE32 = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1950 1951 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1952 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 1953 // We need a 64-bit register source, but the high bits don't matter. 1954 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1955 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1956 unsigned SubReg = InReg ? AMDGPU::sub0 : 0; 1957 1958 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1959 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1960 .addReg(SrcReg, 0, SubReg) 1961 .addImm(AMDGPU::sub0) 1962 .addReg(UndefReg) 1963 .addImm(AMDGPU::sub1); 1964 1965 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1966 .addReg(ExtReg) 1967 .addImm(SrcSize << 16); 1968 1969 I.eraseFromParent(); 1970 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1971 } 1972 1973 unsigned Mask; 1974 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1975 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1976 .addReg(SrcReg) 1977 .addImm(Mask); 1978 } else { 1979 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1980 .addReg(SrcReg) 1981 .addImm(SrcSize << 16); 1982 } 1983 1984 I.eraseFromParent(); 1985 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1986 } 1987 1988 return false; 1989 } 1990 1991 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1992 MachineBasicBlock *BB = I.getParent(); 1993 MachineOperand &ImmOp = I.getOperand(1); 1994 1995 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1996 if (ImmOp.isFPImm()) { 1997 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1998 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1999 } else if (ImmOp.isCImm()) { 2000 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); 2001 } 2002 2003 Register DstReg = I.getOperand(0).getReg(); 2004 unsigned Size; 2005 bool IsSgpr; 2006 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 2007 if (RB) { 2008 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 2009 Size = MRI->getType(DstReg).getSizeInBits(); 2010 } else { 2011 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 2012 IsSgpr = TRI.isSGPRClass(RC); 2013 Size = TRI.getRegSizeInBits(*RC); 2014 } 2015 2016 if (Size != 32 && Size != 64) 2017 return false; 2018 2019 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2020 if (Size == 32) { 2021 I.setDesc(TII.get(Opcode)); 2022 I.addImplicitDefUseOperands(*MF); 2023 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2024 } 2025 2026 const DebugLoc &DL = I.getDebugLoc(); 2027 2028 APInt Imm(Size, I.getOperand(1).getImm()); 2029 2030 MachineInstr *ResInst; 2031 if (IsSgpr && TII.isInlineConstant(Imm)) { 2032 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 2033 .addImm(I.getOperand(1).getImm()); 2034 } else { 2035 const TargetRegisterClass *RC = IsSgpr ? 
2036 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 2037 Register LoReg = MRI->createVirtualRegister(RC); 2038 Register HiReg = MRI->createVirtualRegister(RC); 2039 2040 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 2041 .addImm(Imm.trunc(32).getZExtValue()); 2042 2043 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 2044 .addImm(Imm.ashr(32).getZExtValue()); 2045 2046 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2047 .addReg(LoReg) 2048 .addImm(AMDGPU::sub0) 2049 .addReg(HiReg) 2050 .addImm(AMDGPU::sub1); 2051 } 2052 2053 // We can't call constrainSelectedInstRegOperands here, because it doesn't 2054 // work for target independent opcodes 2055 I.eraseFromParent(); 2056 const TargetRegisterClass *DstRC = 2057 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 2058 if (!DstRC) 2059 return true; 2060 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 2061 } 2062 2063 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2064 // Only manually handle the f64 SGPR case. 2065 // 2066 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2067 // the bit ops theoretically have a second result due to the implicit def of 2068 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2069 // that is easy by disabling the check. The result works, but uses a 2070 // nonsensical sreg32orlds_and_sreg_1 regclass. 2071 // 2072 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2073 // the variadic REG_SEQUENCE operands. 2074 2075 Register Dst = MI.getOperand(0).getReg(); 2076 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2077 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2078 MRI->getType(Dst) != LLT::scalar(64)) 2079 return false; 2080 2081 Register Src = MI.getOperand(1).getReg(); 2082 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 2083 if (Fabs) 2084 Src = Fabs->getOperand(1).getReg(); 2085 2086 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2087 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2088 return false; 2089 2090 MachineBasicBlock *BB = MI.getParent(); 2091 const DebugLoc &DL = MI.getDebugLoc(); 2092 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2093 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2094 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2095 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2096 2097 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2098 .addReg(Src, 0, AMDGPU::sub0); 2099 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2100 .addReg(Src, 0, AMDGPU::sub1); 2101 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2102 .addImm(0x80000000); 2103 2104 // Set or toggle sign bit. 2105 unsigned Opc = Fabs ? 
AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2106   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2107     .addReg(HiReg)
2108     .addReg(ConstReg);
2109   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2110     .addReg(LoReg)
2111     .addImm(AMDGPU::sub0)
2112     .addReg(OpReg)
2113     .addImm(AMDGPU::sub1);
2114   MI.eraseFromParent();
2115   return true;
2116 }
2117
2118 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2119 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2120   Register Dst = MI.getOperand(0).getReg();
2121   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2122   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2123       MRI->getType(Dst) != LLT::scalar(64))
2124     return false;
2125
2126   Register Src = MI.getOperand(1).getReg();
2127   MachineBasicBlock *BB = MI.getParent();
2128   const DebugLoc &DL = MI.getDebugLoc();
2129   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2130   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2131   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2132   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2133
2134   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2135       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2136     return false;
2137
2138   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2139     .addReg(Src, 0, AMDGPU::sub0);
2140   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2141     .addReg(Src, 0, AMDGPU::sub1);
2142   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2143     .addImm(0x7fffffff);
2144
2145   // Clear sign bit.
2146   // TODO: Should this use S_BITSET0_*?
2147   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2148     .addReg(HiReg)
2149     .addReg(ConstReg);
2150   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2151     .addReg(LoReg)
2152     .addImm(AMDGPU::sub0)
2153     .addReg(OpReg)
2154     .addImm(AMDGPU::sub1);
2155
2156   MI.eraseFromParent();
2157   return true;
2158 }
2159
2160 static bool isConstant(const MachineInstr &MI) {
2161   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2162 }
2163
2164 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2165     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2166
2167   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2168
2169   assert(PtrMI);
2170
2171   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2172     return;
2173
2174   GEPInfo GEPInfo(*PtrMI);
2175
2176   for (unsigned i = 1; i != 3; ++i) {
2177     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2178     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2179     assert(OpDef);
2180     if (i == 2 && isConstant(*OpDef)) {
2181       // TODO: Could handle constant base + variable offset, but a combine
2182       // probably should have commuted it.
2183       assert(GEPInfo.Imm == 0);
2184       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2185       continue;
2186     }
2187     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2188     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2189       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2190     else
2191       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2192   }
2193
2194   AddrInfo.push_back(GEPInfo);
2195   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2196 }
2197
2198 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2199   if (!MI.hasOneMemOperand())
2200     return false;
2201
2202   const MachineMemOperand *MMO = *MI.memoperands_begin();
2203   const Value *Ptr = MMO->getValue();
2204
2205   // UndefValue means this is a load of a kernel input. These are uniform.
2206   // Sometimes LDS instructions have constant pointers.
2207   // If Ptr is null, then that means this mem operand contains a
2208   // PseudoSourceValue like GOT.
2209   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2210       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2211     return true;
2212
2213   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2214     return true;
2215
2216   const Instruction *I = dyn_cast<Instruction>(Ptr);
2217   return I && I->getMetadata("amdgpu.uniform");
2218 }
2219
2220 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2221   for (const GEPInfo &GEPInfo : AddrInfo) {
2222     if (!GEPInfo.VgprParts.empty())
2223       return true;
2224   }
2225   return false;
2226 }
2227
2228 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2229   MachineBasicBlock *BB = I.getParent();
2230
2231   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2232   unsigned AS = PtrTy.getAddressSpace();
2233   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2234       STI.ldsRequiresM0Init()) {
2235     // If DS instructions require M0 initialization, insert it before selecting.
2236     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2237       .addImm(-1);
2238   }
2239 }
2240
2241 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
2242   initM0(I);
2243   return selectImpl(I, *CoverageInfo);
2244 }
2245
2246 // TODO: No rtn optimization.
2247 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2248   MachineInstr &MI) const {
2249   Register PtrReg = MI.getOperand(1).getReg();
2250   const LLT PtrTy = MRI->getType(PtrReg);
2251   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2252       STI.useFlatForGlobal())
2253     return selectImpl(MI, *CoverageInfo);
2254
2255   Register DstReg = MI.getOperand(0).getReg();
2256   const LLT Ty = MRI->getType(DstReg);
2257   const bool Is64 = Ty.getSizeInBits() == 64;
2258   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2259   Register TmpReg = MRI->createVirtualRegister(
2260     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2261
2262   const DebugLoc &DL = MI.getDebugLoc();
2263   MachineBasicBlock *BB = MI.getParent();
2264
2265   Register VAddr, RSrcReg, SOffset;
2266   int64_t Offset = 0;
2267
2268   unsigned Opcode;
2269   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2270     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2271       AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2272   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2273                                    RSrcReg, SOffset, Offset)) {
2274     Opcode = Is64 ?
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : 2275 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; 2276 } else 2277 return selectImpl(MI, *CoverageInfo); 2278 2279 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) 2280 .addReg(MI.getOperand(2).getReg()); 2281 2282 if (VAddr) 2283 MIB.addReg(VAddr); 2284 2285 MIB.addReg(RSrcReg); 2286 if (SOffset) 2287 MIB.addReg(SOffset); 2288 else 2289 MIB.addImm(0); 2290 2291 MIB.addImm(Offset); 2292 MIB.addImm(0); // slc 2293 MIB.cloneMemRefs(MI); 2294 2295 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) 2296 .addReg(TmpReg, RegState::Kill, SubReg); 2297 2298 MI.eraseFromParent(); 2299 2300 MRI->setRegClass( 2301 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); 2302 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2303 } 2304 2305 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 2306 MachineBasicBlock *BB = I.getParent(); 2307 MachineOperand &CondOp = I.getOperand(0); 2308 Register CondReg = CondOp.getReg(); 2309 const DebugLoc &DL = I.getDebugLoc(); 2310 2311 unsigned BrOpcode; 2312 Register CondPhysReg; 2313 const TargetRegisterClass *ConstrainRC; 2314 2315 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2316 // whether the branch is uniform when selecting the instruction. In 2317 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 2318 // RegBankSelect knows what it's doing if the branch condition is scc, even 2319 // though it currently does not. 2320 if (!isVCC(CondReg, *MRI)) { 2321 if (MRI->getType(CondReg) != LLT::scalar(32)) 2322 return false; 2323 2324 CondPhysReg = AMDGPU::SCC; 2325 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 2326 ConstrainRC = &AMDGPU::SReg_32RegClass; 2327 } else { 2328 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 2329 // We sort of know that a VCC producer based on the register bank, that ands 2330 // inactive lanes with 0. What if there was a logical operation with vcc 2331 // producers in different blocks/with different exec masks? 2332 // FIXME: Should scc->vcc copies and with exec? 2333 CondPhysReg = TRI.getVCC(); 2334 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 2335 ConstrainRC = TRI.getBoolRC(); 2336 } 2337 2338 if (!MRI->getRegClassOrNull(CondReg)) 2339 MRI->setRegClass(CondReg, ConstrainRC); 2340 2341 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 2342 .addReg(CondReg); 2343 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 2344 .addMBB(I.getOperand(1).getMBB()); 2345 2346 I.eraseFromParent(); 2347 return true; 2348 } 2349 2350 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE( 2351 MachineInstr &I) const { 2352 Register DstReg = I.getOperand(0).getReg(); 2353 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2354 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2355 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 2356 if (IsVGPR) 2357 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 2358 2359 return RBI.constrainGenericRegister( 2360 DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 2361 } 2362 2363 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 2364 Register DstReg = I.getOperand(0).getReg(); 2365 Register SrcReg = I.getOperand(1).getReg(); 2366 Register MaskReg = I.getOperand(2).getReg(); 2367 LLT Ty = MRI->getType(DstReg); 2368 LLT MaskTy = MRI->getType(MaskReg); 2369 2370 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2371 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2372 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 2373 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2374 if (DstRB != SrcRB) // Should only happen for hand written MIR. 2375 return false; 2376 2377 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2378 const TargetRegisterClass &RegRC 2379 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2380 2381 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 2382 *MRI); 2383 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 2384 *MRI); 2385 const TargetRegisterClass *MaskRC = 2386 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); 2387 2388 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2389 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2390 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 2391 return false; 2392 2393 MachineBasicBlock *BB = I.getParent(); 2394 const DebugLoc &DL = I.getDebugLoc(); 2395 if (Ty.getSizeInBits() == 32) { 2396 assert(MaskTy.getSizeInBits() == 32 && 2397 "ptrmask should have been narrowed during legalize"); 2398 2399 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 2400 .addReg(SrcReg) 2401 .addReg(MaskReg); 2402 I.eraseFromParent(); 2403 return true; 2404 } 2405 2406 Register HiReg = MRI->createVirtualRegister(&RegRC); 2407 Register LoReg = MRI->createVirtualRegister(&RegRC); 2408 2409 // Extract the subregisters from the source pointer. 2410 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 2411 .addReg(SrcReg, 0, AMDGPU::sub0); 2412 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 2413 .addReg(SrcReg, 0, AMDGPU::sub1); 2414 2415 Register MaskedLo, MaskedHi; 2416 2417 // Try to avoid emitting a bit operation when we only need to touch half of 2418 // the 64-bit pointer. 2419 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); 2420 2421 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 2422 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 2423 if ((MaskOnes & MaskLo32) == MaskLo32) { 2424 // If all the bits in the low half are 1, we only need a copy for it. 2425 MaskedLo = LoReg; 2426 } else { 2427 // Extract the mask subregister and apply the and. 2428 Register MaskLo = MRI->createVirtualRegister(&RegRC); 2429 MaskedLo = MRI->createVirtualRegister(&RegRC); 2430 2431 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 2432 .addReg(MaskReg, 0, AMDGPU::sub0); 2433 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 2434 .addReg(LoReg) 2435 .addReg(MaskLo); 2436 } 2437 2438 if ((MaskOnes & MaskHi32) == MaskHi32) { 2439 // If all the bits in the high half are 1, we only need a copy for it. 
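    // (Added note: a conservative all-zeros KnownBits answer is still safe;
    // it just means the AND is emitted for both halves.)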
2440 MaskedHi = HiReg; 2441 } else { 2442 Register MaskHi = MRI->createVirtualRegister(&RegRC); 2443 MaskedHi = MRI->createVirtualRegister(&RegRC); 2444 2445 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 2446 .addReg(MaskReg, 0, AMDGPU::sub1); 2447 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 2448 .addReg(HiReg) 2449 .addReg(MaskHi); 2450 } 2451 2452 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2453 .addReg(MaskedLo) 2454 .addImm(AMDGPU::sub0) 2455 .addReg(MaskedHi) 2456 .addImm(AMDGPU::sub1); 2457 I.eraseFromParent(); 2458 return true; 2459 } 2460 2461 /// Return the register to use for the index value, and the subregister to use 2462 /// for the indirectly accessed register. 2463 static std::pair<Register, unsigned> 2464 computeIndirectRegIndex(MachineRegisterInfo &MRI, 2465 const SIRegisterInfo &TRI, 2466 const TargetRegisterClass *SuperRC, 2467 Register IdxReg, 2468 unsigned EltSize) { 2469 Register IdxBaseReg; 2470 int Offset; 2471 MachineInstr *Unused; 2472 2473 std::tie(IdxBaseReg, Offset, Unused) 2474 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 2475 if (IdxBaseReg == AMDGPU::NoRegister) { 2476 // This will happen if the index is a known constant. This should ordinarily 2477 // be legalized out, but handle it as a register just in case. 2478 assert(Offset == 0); 2479 IdxBaseReg = IdxReg; 2480 } 2481 2482 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 2483 2484 // Skip out of bounds offsets, or else we would end up using an undefined 2485 // register. 2486 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 2487 return std::make_pair(IdxReg, SubRegs[0]); 2488 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 2489 } 2490 2491 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 2492 MachineInstr &MI) const { 2493 Register DstReg = MI.getOperand(0).getReg(); 2494 Register SrcReg = MI.getOperand(1).getReg(); 2495 Register IdxReg = MI.getOperand(2).getReg(); 2496 2497 LLT DstTy = MRI->getType(DstReg); 2498 LLT SrcTy = MRI->getType(SrcReg); 2499 2500 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2501 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2502 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2503 2504 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2505 // into a waterfall loop. 2506 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2507 return false; 2508 2509 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 2510 *MRI); 2511 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 2512 *MRI); 2513 if (!SrcRC || !DstRC) 2514 return false; 2515 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2516 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2517 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2518 return false; 2519 2520 MachineBasicBlock *BB = MI.getParent(); 2521 const DebugLoc &DL = MI.getDebugLoc(); 2522 const bool Is64 = DstTy.getSizeInBits() == 64; 2523 2524 unsigned SubReg; 2525 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 2526 DstTy.getSizeInBits() / 8); 2527 2528 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 2529 if (DstTy.getSizeInBits() != 32 && !Is64) 2530 return false; 2531 2532 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2533 .addReg(IdxReg); 2534 2535 unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 2536 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 2537 .addReg(SrcReg, 0, SubReg) 2538 .addReg(SrcReg, RegState::Implicit); 2539 MI.eraseFromParent(); 2540 return true; 2541 } 2542 2543 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 2544 return false; 2545 2546 if (!STI.useVGPRIndexMode()) { 2547 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2548 .addReg(IdxReg); 2549 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 2550 .addReg(SrcReg, 0, SubReg) 2551 .addReg(SrcReg, RegState::Implicit); 2552 MI.eraseFromParent(); 2553 return true; 2554 } 2555 2556 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2557 .addReg(IdxReg) 2558 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 2559 BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg) 2560 .addReg(SrcReg, 0, SubReg) 2561 .addReg(SrcReg, RegState::Implicit) 2562 .addReg(AMDGPU::M0, RegState::Implicit); 2563 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2564 2565 MI.eraseFromParent(); 2566 return true; 2567 } 2568 2569 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 2570 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 2571 MachineInstr &MI) const { 2572 Register DstReg = MI.getOperand(0).getReg(); 2573 Register VecReg = MI.getOperand(1).getReg(); 2574 Register ValReg = MI.getOperand(2).getReg(); 2575 Register IdxReg = MI.getOperand(3).getReg(); 2576 2577 LLT VecTy = MRI->getType(DstReg); 2578 LLT ValTy = MRI->getType(ValReg); 2579 unsigned VecSize = VecTy.getSizeInBits(); 2580 unsigned ValSize = ValTy.getSizeInBits(); 2581 2582 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 2583 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 2584 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2585 2586 assert(VecTy.getElementType() == ValTy); 2587 2588 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2589 // into a waterfall loop. 
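  // (Added note: a VGPR index reaching this point would mean that rewrite did
  // not happen, so fail selection rather than emit a wrong-index write.)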
2590 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2591 return false; 2592 2593 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 2594 *MRI); 2595 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 2596 *MRI); 2597 2598 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 2599 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 2600 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 2601 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2602 return false; 2603 2604 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 2605 return false; 2606 2607 unsigned SubReg; 2608 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 2609 ValSize / 8); 2610 2611 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 2612 STI.useVGPRIndexMode(); 2613 2614 MachineBasicBlock *BB = MI.getParent(); 2615 const DebugLoc &DL = MI.getDebugLoc(); 2616 2617 if (IndexMode) { 2618 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2619 .addReg(IdxReg) 2620 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 2621 } else { 2622 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2623 .addReg(IdxReg); 2624 } 2625 2626 const MCInstrDesc &RegWriteOp 2627 = TII.getIndirectRegWritePseudo(VecSize, ValSize, 2628 VecRB->getID() == AMDGPU::SGPRRegBankID); 2629 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 2630 .addReg(VecReg) 2631 .addReg(ValReg) 2632 .addImm(SubReg); 2633 2634 if (IndexMode) 2635 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2636 2637 MI.eraseFromParent(); 2638 return true; 2639 } 2640 2641 static bool isZeroOrUndef(int X) { 2642 return X == 0 || X == -1; 2643 } 2644 2645 static bool isOneOrUndef(int X) { 2646 return X == 1 || X == -1; 2647 } 2648 2649 static bool isZeroOrOneOrUndef(int X) { 2650 return X == 0 || X == 1 || X == -1; 2651 } 2652 2653 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single 2654 // 32-bit register. 2655 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, 2656 ArrayRef<int> Mask) { 2657 NewMask[0] = Mask[0]; 2658 NewMask[1] = Mask[1]; 2659 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) 2660 return Src0; 2661 2662 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); 2663 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); 2664 2665 // Shift the mask inputs to be 0/1; 2666 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; 2667 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; 2668 return Src1; 2669 } 2670 2671 // This is only legal with VOP3P instructions as an aid to op_sel matching. 
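// (Added note: a legal mask here only ever reads lanes from one of the two
// sources, e.g. <1, 0> swaps the half-words of a single 32-bit register;
// masks that would need to merge both sources are rejected by
// isLegalVOP3PShuffleMask.)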
2672 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( 2673 MachineInstr &MI) const { 2674 Register DstReg = MI.getOperand(0).getReg(); 2675 Register Src0Reg = MI.getOperand(1).getReg(); 2676 Register Src1Reg = MI.getOperand(2).getReg(); 2677 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); 2678 2679 const LLT V2S16 = LLT::vector(2, 16); 2680 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) 2681 return false; 2682 2683 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) 2684 return false; 2685 2686 assert(ShufMask.size() == 2); 2687 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); 2688 2689 MachineBasicBlock *MBB = MI.getParent(); 2690 const DebugLoc &DL = MI.getDebugLoc(); 2691 2692 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2693 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2694 const TargetRegisterClass &RC = IsVALU ? 2695 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2696 2697 // Handle the degenerate case which should have folded out. 2698 if (ShufMask[0] == -1 && ShufMask[1] == -1) { 2699 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); 2700 2701 MI.eraseFromParent(); 2702 return RBI.constrainGenericRegister(DstReg, RC, *MRI); 2703 } 2704 2705 // A legal VOP3P mask only reads one of the sources. 2706 int Mask[2]; 2707 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); 2708 2709 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || 2710 !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) 2711 return false; 2712 2713 // TODO: This also should have been folded out 2714 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { 2715 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) 2716 .addReg(SrcVec); 2717 2718 MI.eraseFromParent(); 2719 return true; 2720 } 2721 2722 if (Mask[0] == 1 && Mask[1] == -1) { 2723 if (IsVALU) { 2724 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 2725 .addImm(16) 2726 .addReg(SrcVec); 2727 } else { 2728 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 2729 .addReg(SrcVec) 2730 .addImm(16); 2731 } 2732 } else if (Mask[0] == -1 && Mask[1] == 0) { 2733 if (IsVALU) { 2734 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) 2735 .addImm(16) 2736 .addReg(SrcVec); 2737 } else { 2738 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) 2739 .addReg(SrcVec) 2740 .addImm(16); 2741 } 2742 } else if (Mask[0] == 0 && Mask[1] == 0) { 2743 if (IsVALU) { 2744 // Write low half of the register into the high half. 2745 MachineInstr *MovSDWA = 2746 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2747 .addImm(0) // $src0_modifiers 2748 .addReg(SrcVec) // $src0 2749 .addImm(0) // $clamp 2750 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2751 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2752 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2753 .addReg(SrcVec, RegState::Implicit); 2754 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2755 } else { 2756 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2757 .addReg(SrcVec) 2758 .addReg(SrcVec); 2759 } 2760 } else if (Mask[0] == 1 && Mask[1] == 1) { 2761 if (IsVALU) { 2762 // Write high half of the register into the low half. 
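      // (Added note: as with the low->high case above, UNUSED_PRESERVE plus
      // the tied implicit use keeps the untouched half of the destination
      // intact.)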
2763 MachineInstr *MovSDWA = 2764 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2765 .addImm(0) // $src0_modifiers 2766 .addReg(SrcVec) // $src0 2767 .addImm(0) // $clamp 2768 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel 2769 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2770 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel 2771 .addReg(SrcVec, RegState::Implicit); 2772 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2773 } else { 2774 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) 2775 .addReg(SrcVec) 2776 .addReg(SrcVec); 2777 } 2778 } else if (Mask[0] == 1 && Mask[1] == 0) { 2779 if (IsVALU) { 2780 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg) 2781 .addReg(SrcVec) 2782 .addReg(SrcVec) 2783 .addImm(16); 2784 } else { 2785 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2786 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) 2787 .addReg(SrcVec) 2788 .addImm(16); 2789 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2790 .addReg(TmpReg) 2791 .addReg(SrcVec); 2792 } 2793 } else 2794 llvm_unreachable("all shuffle masks should be handled"); 2795 2796 MI.eraseFromParent(); 2797 return true; 2798 } 2799 2800 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 2801 if (I.isPHI()) 2802 return selectPHI(I); 2803 2804 if (!I.isPreISelOpcode()) { 2805 if (I.isCopy()) 2806 return selectCOPY(I); 2807 return true; 2808 } 2809 2810 switch (I.getOpcode()) { 2811 case TargetOpcode::G_AND: 2812 case TargetOpcode::G_OR: 2813 case TargetOpcode::G_XOR: 2814 if (selectImpl(I, *CoverageInfo)) 2815 return true; 2816 return selectG_AND_OR_XOR(I); 2817 case TargetOpcode::G_ADD: 2818 case TargetOpcode::G_SUB: 2819 if (selectImpl(I, *CoverageInfo)) 2820 return true; 2821 return selectG_ADD_SUB(I); 2822 case TargetOpcode::G_UADDO: 2823 case TargetOpcode::G_USUBO: 2824 case TargetOpcode::G_UADDE: 2825 case TargetOpcode::G_USUBE: 2826 return selectG_UADDO_USUBO_UADDE_USUBE(I); 2827 case TargetOpcode::G_INTTOPTR: 2828 case TargetOpcode::G_BITCAST: 2829 case TargetOpcode::G_PTRTOINT: 2830 return selectCOPY(I); 2831 case TargetOpcode::G_CONSTANT: 2832 case TargetOpcode::G_FCONSTANT: 2833 return selectG_CONSTANT(I); 2834 case TargetOpcode::G_FNEG: 2835 if (selectImpl(I, *CoverageInfo)) 2836 return true; 2837 return selectG_FNEG(I); 2838 case TargetOpcode::G_FABS: 2839 if (selectImpl(I, *CoverageInfo)) 2840 return true; 2841 return selectG_FABS(I); 2842 case TargetOpcode::G_EXTRACT: 2843 return selectG_EXTRACT(I); 2844 case TargetOpcode::G_MERGE_VALUES: 2845 case TargetOpcode::G_BUILD_VECTOR: 2846 case TargetOpcode::G_CONCAT_VECTORS: 2847 return selectG_MERGE_VALUES(I); 2848 case TargetOpcode::G_UNMERGE_VALUES: 2849 return selectG_UNMERGE_VALUES(I); 2850 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2851 return selectG_BUILD_VECTOR_TRUNC(I); 2852 case TargetOpcode::G_PTR_ADD: 2853 return selectG_PTR_ADD(I); 2854 case TargetOpcode::G_IMPLICIT_DEF: 2855 return selectG_IMPLICIT_DEF(I); 2856 case TargetOpcode::G_FREEZE: 2857 return selectCOPY(I); 2858 case TargetOpcode::G_INSERT: 2859 return selectG_INSERT(I); 2860 case TargetOpcode::G_INTRINSIC: 2861 return selectG_INTRINSIC(I); 2862 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 2863 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 2864 case TargetOpcode::G_ICMP: 2865 if (selectG_ICMP(I)) 2866 return true; 2867 return selectImpl(I, *CoverageInfo); 2868 case TargetOpcode::G_LOAD: 2869 case TargetOpcode::G_ATOMIC_CMPXCHG: 2870 case TargetOpcode::G_ATOMICRMW_XCHG: 2871 case 
TargetOpcode::G_ATOMICRMW_ADD: 2872 case TargetOpcode::G_ATOMICRMW_SUB: 2873 case TargetOpcode::G_ATOMICRMW_AND: 2874 case TargetOpcode::G_ATOMICRMW_OR: 2875 case TargetOpcode::G_ATOMICRMW_XOR: 2876 case TargetOpcode::G_ATOMICRMW_MIN: 2877 case TargetOpcode::G_ATOMICRMW_MAX: 2878 case TargetOpcode::G_ATOMICRMW_UMIN: 2879 case TargetOpcode::G_ATOMICRMW_UMAX: 2880 case TargetOpcode::G_ATOMICRMW_FADD: 2881 case AMDGPU::G_AMDGPU_ATOMIC_INC: 2882 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 2883 case AMDGPU::G_AMDGPU_ATOMIC_FMIN: 2884 case AMDGPU::G_AMDGPU_ATOMIC_FMAX: 2885 return selectG_LOAD_ATOMICRMW(I); 2886 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 2887 return selectG_AMDGPU_ATOMIC_CMPXCHG(I); 2888 case TargetOpcode::G_SELECT: 2889 return selectG_SELECT(I); 2890 case TargetOpcode::G_STORE: 2891 return selectG_STORE(I); 2892 case TargetOpcode::G_TRUNC: 2893 return selectG_TRUNC(I); 2894 case TargetOpcode::G_SEXT: 2895 case TargetOpcode::G_ZEXT: 2896 case TargetOpcode::G_ANYEXT: 2897 case TargetOpcode::G_SEXT_INREG: 2898 if (selectImpl(I, *CoverageInfo)) 2899 return true; 2900 return selectG_SZA_EXT(I); 2901 case TargetOpcode::G_BRCOND: 2902 return selectG_BRCOND(I); 2903 case TargetOpcode::G_GLOBAL_VALUE: 2904 return selectG_GLOBAL_VALUE(I); 2905 case TargetOpcode::G_PTRMASK: 2906 return selectG_PTRMASK(I); 2907 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2908 return selectG_EXTRACT_VECTOR_ELT(I); 2909 case TargetOpcode::G_INSERT_VECTOR_ELT: 2910 return selectG_INSERT_VECTOR_ELT(I); 2911 case TargetOpcode::G_SHUFFLE_VECTOR: 2912 return selectG_SHUFFLE_VECTOR(I); 2913 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 2914 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 2915 const AMDGPU::ImageDimIntrinsicInfo *Intr 2916 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); 2917 assert(Intr && "not an image intrinsic with image pseudo"); 2918 return selectImageIntrinsic(I, Intr); 2919 } 2920 default: 2921 return selectImpl(I, *CoverageInfo); 2922 } 2923 return false; 2924 } 2925 2926 InstructionSelector::ComplexRendererFns 2927 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 2928 return {{ 2929 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2930 }}; 2931 2932 } 2933 2934 std::pair<Register, unsigned> 2935 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const { 2936 Register Src = Root.getReg(); 2937 Register OrigSrc = Src; 2938 unsigned Mods = 0; 2939 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 2940 2941 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 2942 Src = MI->getOperand(1).getReg(); 2943 Mods |= SISrcMods::NEG; 2944 MI = getDefIgnoringCopies(Src, *MRI); 2945 } 2946 2947 if (MI && MI->getOpcode() == AMDGPU::G_FABS) { 2948 Src = MI->getOperand(1).getReg(); 2949 Mods |= SISrcMods::ABS; 2950 } 2951 2952 if (Mods != 0 && 2953 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 2954 MachineInstr *UseMI = Root.getParent(); 2955 2956 // If we looked through copies to find source modifiers on an SGPR operand, 2957 // we now have an SGPR register source. To avoid potentially violating the 2958 // constant bus restriction, we need to insert a copy to a VGPR. 2959 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); 2960 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 2961 TII.get(AMDGPU::COPY), VGPRSrc) 2962 .addReg(Src); 2963 Src = VGPRSrc; 2964 } 2965 2966 return std::make_pair(Src, Mods); 2967 } 2968 2969 /// 2970 /// This will select either an SGPR or VGPR operand and will save us from 2971 /// having to write an extra tablegen pattern. 
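/// (Added note: the operand is rendered unchanged here; the register class is
/// resolved later when the user instruction itself is constrained.)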
2972 InstructionSelector::ComplexRendererFns 2973 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 2974 return {{ 2975 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2976 }}; 2977 } 2978 2979 InstructionSelector::ComplexRendererFns 2980 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 2981 Register Src; 2982 unsigned Mods; 2983 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2984 2985 return {{ 2986 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2987 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 2988 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2989 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2990 }}; 2991 } 2992 2993 InstructionSelector::ComplexRendererFns 2994 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 2995 return {{ 2996 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2997 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2998 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2999 }}; 3000 } 3001 3002 InstructionSelector::ComplexRendererFns 3003 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 3004 Register Src; 3005 unsigned Mods; 3006 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3007 3008 return {{ 3009 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3010 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3011 }}; 3012 } 3013 3014 InstructionSelector::ComplexRendererFns 3015 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 3016 Register Reg = Root.getReg(); 3017 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 3018 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 3019 Def->getOpcode() == AMDGPU::G_FABS)) 3020 return {}; 3021 return {{ 3022 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3023 }}; 3024 } 3025 3026 std::pair<Register, unsigned> 3027 AMDGPUInstructionSelector::selectVOP3PModsImpl( 3028 Register Src, const MachineRegisterInfo &MRI) const { 3029 unsigned Mods = 0; 3030 MachineInstr *MI = MRI.getVRegDef(Src); 3031 3032 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 3033 // It's possible to see an f32 fneg here, but unlikely. 3034 // TODO: Treat f32 fneg as only high bit. 3035 MRI.getType(Src) == LLT::vector(2, 16)) { 3036 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 3037 Src = MI->getOperand(1).getReg(); 3038 MI = MRI.getVRegDef(Src); 3039 } 3040 3041 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 3042 3043 // Packed instructions do not have abs modifiers. 
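  // (Added note: op_sel_hi is set by default so the high half of the source
  // feeds the high half of the operation, i.e. the neutral encoding for a
  // packed operand.)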
3044 Mods |= SISrcMods::OP_SEL_1; 3045 3046 return std::make_pair(Src, Mods); 3047 } 3048 3049 InstructionSelector::ComplexRendererFns 3050 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 3051 MachineRegisterInfo &MRI 3052 = Root.getParent()->getParent()->getParent()->getRegInfo(); 3053 3054 Register Src; 3055 unsigned Mods; 3056 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 3057 3058 return {{ 3059 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3060 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3061 }}; 3062 } 3063 3064 InstructionSelector::ComplexRendererFns 3065 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 3066 Register Src; 3067 unsigned Mods; 3068 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3069 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) 3070 return None; 3071 3072 return {{ 3073 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3074 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3075 }}; 3076 } 3077 3078 InstructionSelector::ComplexRendererFns 3079 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 3080 // FIXME: Handle op_sel 3081 return {{ 3082 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3083 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 3084 }}; 3085 } 3086 3087 InstructionSelector::ComplexRendererFns 3088 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 3089 SmallVector<GEPInfo, 4> AddrInfo; 3090 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3091 3092 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3093 return None; 3094 3095 const GEPInfo &GEPInfo = AddrInfo[0]; 3096 Optional<int64_t> EncodedImm = 3097 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false); 3098 if (!EncodedImm) 3099 return None; 3100 3101 unsigned PtrReg = GEPInfo.SgprParts[0]; 3102 return {{ 3103 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3104 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3105 }}; 3106 } 3107 3108 InstructionSelector::ComplexRendererFns 3109 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 3110 SmallVector<GEPInfo, 4> AddrInfo; 3111 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3112 3113 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3114 return None; 3115 3116 const GEPInfo &GEPInfo = AddrInfo[0]; 3117 Register PtrReg = GEPInfo.SgprParts[0]; 3118 Optional<int64_t> EncodedImm = 3119 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 3120 if (!EncodedImm) 3121 return None; 3122 3123 return {{ 3124 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3125 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3126 }}; 3127 } 3128 3129 InstructionSelector::ComplexRendererFns 3130 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 3131 MachineInstr *MI = Root.getParent(); 3132 MachineBasicBlock *MBB = MI->getParent(); 3133 3134 SmallVector<GEPInfo, 4> AddrInfo; 3135 getAddrModeInfo(*MI, *MRI, AddrInfo); 3136 3137 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, 3138 // then we can select all ptr + 32-bit offsets not just immediate offsets. 3139 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3140 return None; 3141 3142 const GEPInfo &GEPInfo = AddrInfo[0]; 3143 // SGPR offset is unsigned. 
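  // (Added note: reject zero, since it is not worth materializing an offset
  // register for it, as well as negative and wider-than-32-bit offsets;
  // anything else is moved into an SGPR below.)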
3144 if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) 3145 return None; 3146 3147 // If we make it this far we have a load with an 32-bit immediate offset. 3148 // It is OK to select this using a sgpr offset, because we have already 3149 // failed trying to select this load into one of the _IMM variants since 3150 // the _IMM Patterns are considered before the _SGPR patterns. 3151 Register PtrReg = GEPInfo.SgprParts[0]; 3152 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3153 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) 3154 .addImm(GEPInfo.Imm); 3155 return {{ 3156 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3157 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } 3158 }}; 3159 } 3160 3161 template <bool Signed> 3162 InstructionSelector::ComplexRendererFns 3163 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { 3164 MachineInstr *MI = Root.getParent(); 3165 3166 InstructionSelector::ComplexRendererFns Default = {{ 3167 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3168 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset 3169 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 3170 }}; 3171 3172 if (!STI.hasFlatInstOffsets()) 3173 return Default; 3174 3175 const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); 3176 if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD) 3177 return Default; 3178 3179 Optional<int64_t> Offset = 3180 getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); 3181 if (!Offset.hasValue()) 3182 return Default; 3183 3184 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 3185 if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed)) 3186 return Default; 3187 3188 Register BasePtr = OpDef->getOperand(1).getReg(); 3189 3190 return {{ 3191 [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, 3192 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, 3193 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 3194 }}; 3195 } 3196 3197 InstructionSelector::ComplexRendererFns 3198 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 3199 return selectFlatOffsetImpl<false>(Root); 3200 } 3201 3202 InstructionSelector::ComplexRendererFns 3203 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { 3204 return selectFlatOffsetImpl<true>(Root); 3205 } 3206 3207 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 3208 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 3209 return PSV && PSV->isStack(); 3210 } 3211 3212 InstructionSelector::ComplexRendererFns 3213 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 3214 MachineInstr *MI = Root.getParent(); 3215 MachineBasicBlock *MBB = MI->getParent(); 3216 MachineFunction *MF = MBB->getParent(); 3217 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 3218 3219 int64_t Offset = 0; 3220 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) && 3221 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) { 3222 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3223 3224 // TODO: Should this be inside the render function? The iterator seems to 3225 // move. 
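    // (Added note: the constant is split into a 4 KiB-aligned base
    // materialized in a VGPR and a low part that fits the 12-bit MUBUF
    // immediate offset field.)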
3226     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3227             HighBits)
3228       .addImm(Offset & ~4095);
3229
3230     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3231                MIB.addReg(Info->getScratchRSrcReg());
3232              },
3233              [=](MachineInstrBuilder &MIB) { // vaddr
3234                MIB.addReg(HighBits);
3235              },
3236              [=](MachineInstrBuilder &MIB) { // soffset
3237                const MachineMemOperand *MMO = *MI->memoperands_begin();
3238                const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3239
3240                if (isStackPtrRelative(PtrInfo))
3241                  MIB.addReg(Info->getStackPtrOffsetReg());
3242                else
3243                  MIB.addImm(0);
3244              },
3245              [=](MachineInstrBuilder &MIB) { // offset
3246                MIB.addImm(Offset & 4095);
3247              }}};
3248   }
3249
3250   assert(Offset == 0 || Offset == -1);
3251
3252   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3253   // offsets.
3254   Optional<int> FI;
3255   Register VAddr = Root.getReg();
3256   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3257     if (isBaseWithConstantOffset(Root, *MRI)) {
3258       const MachineOperand &LHS = RootDef->getOperand(1);
3259       const MachineOperand &RHS = RootDef->getOperand(2);
3260       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3261       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3262       if (LHSDef && RHSDef) {
3263         int64_t PossibleOffset =
3264           RHSDef->getOperand(1).getCImm()->getSExtValue();
3265         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3266             (!STI.privateMemoryResourceIsRangeChecked() ||
3267              KnownBits->signBitIsZero(LHS.getReg()))) {
3268           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3269             FI = LHSDef->getOperand(1).getIndex();
3270           else
3271             VAddr = LHS.getReg();
3272           Offset = PossibleOffset;
3273         }
3274       }
3275     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3276       FI = RootDef->getOperand(1).getIndex();
3277     }
3278   }
3279
3280   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3281              MIB.addReg(Info->getScratchRSrcReg());
3282            },
3283            [=](MachineInstrBuilder &MIB) { // vaddr
3284              if (FI.hasValue())
3285                MIB.addFrameIndex(FI.getValue());
3286              else
3287                MIB.addReg(VAddr);
3288            },
3289            [=](MachineInstrBuilder &MIB) { // soffset
3290              // If we don't know this private access is a local stack object, it
3291              // needs to be relative to the entry point's scratch wave offset.
3292              // TODO: Should split large offsets that don't fit like above.
3293              // TODO: Don't use scratch wave offset just because the offset
3294              // didn't fit.
3295              if (!Info->isEntryFunction() && FI.hasValue())
3296                MIB.addReg(Info->getStackPtrOffsetReg());
3297              else
3298                MIB.addImm(0);
3299            },
3300            [=](MachineInstrBuilder &MIB) { // offset
3301              MIB.addImm(Offset);
3302            }}};
3303 }
3304
3305 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3306                                                 int64_t Offset,
3307                                                 unsigned OffsetBits) const {
3308   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3309       (OffsetBits == 8 && !isUInt<8>(Offset)))
3310     return false;
3311
3312   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3313     return true;
3314
3315   // On Southern Islands, instructions with a negative base value and an
3316   // offset don't seem to work.
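  // (Added note: proving the sign bit of the base is zero is enough to rule
  // that case out.)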
3317 return KnownBits->signBitIsZero(Base); 3318 } 3319 3320 InstructionSelector::ComplexRendererFns 3321 AMDGPUInstructionSelector::selectMUBUFScratchOffset( 3322 MachineOperand &Root) const { 3323 MachineInstr *MI = Root.getParent(); 3324 MachineBasicBlock *MBB = MI->getParent(); 3325 3326 int64_t Offset = 0; 3327 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || 3328 !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) 3329 return {}; 3330 3331 const MachineFunction *MF = MBB->getParent(); 3332 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 3333 const MachineMemOperand *MMO = *MI->memoperands_begin(); 3334 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 3335 3336 return {{ 3337 [=](MachineInstrBuilder &MIB) { // rsrc 3338 MIB.addReg(Info->getScratchRSrcReg()); 3339 }, 3340 [=](MachineInstrBuilder &MIB) { // soffset 3341 if (isStackPtrRelative(PtrInfo)) 3342 MIB.addReg(Info->getStackPtrOffsetReg()); 3343 else 3344 MIB.addImm(0); 3345 }, 3346 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 3347 }}; 3348 } 3349 3350 std::pair<Register, unsigned> 3351 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 3352 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 3353 if (!RootDef) 3354 return std::make_pair(Root.getReg(), 0); 3355 3356 int64_t ConstAddr = 0; 3357 3358 Register PtrBase; 3359 int64_t Offset; 3360 std::tie(PtrBase, Offset) = 3361 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 3362 3363 if (Offset) { 3364 if (isDSOffsetLegal(PtrBase, Offset, 16)) { 3365 // (add n0, c0) 3366 return std::make_pair(PtrBase, Offset); 3367 } 3368 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 3369 // TODO 3370 3371 3372 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 3373 // TODO 3374 3375 } 3376 3377 return std::make_pair(Root.getReg(), 0); 3378 } 3379 3380 InstructionSelector::ComplexRendererFns 3381 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { 3382 Register Reg; 3383 unsigned Offset; 3384 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); 3385 return {{ 3386 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3387 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } 3388 }}; 3389 } 3390 3391 InstructionSelector::ComplexRendererFns 3392 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { 3393 Register Reg; 3394 unsigned Offset; 3395 std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root); 3396 return {{ 3397 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3398 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, 3399 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); } 3400 }}; 3401 } 3402 3403 std::pair<Register, unsigned> 3404 AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const { 3405 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 3406 if (!RootDef) 3407 return std::make_pair(Root.getReg(), 0); 3408 3409 int64_t ConstAddr = 0; 3410 3411 Register PtrBase; 3412 int64_t Offset; 3413 std::tie(PtrBase, Offset) = 3414 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 3415 3416 if (Offset) { 3417 int64_t DWordOffset0 = Offset / 4; 3418 int64_t DWordOffset1 = DWordOffset0 + 1; 3419 if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) { 3420 // (add n0, c0) 3421 return std::make_pair(PtrBase, DWordOffset0); 3422 } 3423 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 3424 // TODO 3425 3426 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 3427 // TODO 3428 
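// Not handled yet: a constant address currently falls through to the default
// (base, 0) pair returned below.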
3429 } 3430 3431 return std::make_pair(Root.getReg(), 0); 3432 } 3433 3434 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return 3435 /// the base value with the constant offset. There may be intervening copies 3436 /// between \p Root and the identified constant. Returns \p Root, 0 if this does 3437 /// not match the pattern. 3438 std::pair<Register, int64_t> 3439 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( 3440 Register Root, const MachineRegisterInfo &MRI) const { 3441 MachineInstr *RootI = MRI.getVRegDef(Root); 3442 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) 3443 return {Root, 0}; 3444 3445 MachineOperand &RHS = RootI->getOperand(2); 3446 Optional<ValueAndVReg> MaybeOffset 3447 = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true); 3448 if (!MaybeOffset) 3449 return {Root, 0}; 3450 return {RootI->getOperand(1).getReg(), MaybeOffset->Value}; 3451 } 3452 3453 static void addZeroImm(MachineInstrBuilder &MIB) { 3454 MIB.addImm(0); 3455 } 3456 3457 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p 3458 /// BasePtr is not valid, a null base pointer will be used. 3459 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, 3460 uint32_t FormatLo, uint32_t FormatHi, 3461 Register BasePtr) { 3462 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 3463 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 3464 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 3465 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 3466 3467 B.buildInstr(AMDGPU::S_MOV_B32) 3468 .addDef(RSrc2) 3469 .addImm(FormatLo); 3470 B.buildInstr(AMDGPU::S_MOV_B32) 3471 .addDef(RSrc3) 3472 .addImm(FormatHi); 3473 3474 // Build the half of the subregister with the constants before building the 3475 // full 128-bit register. If we are building multiple resource descriptors, 3476 // this will allow CSEing of the 2-component register. 3477 B.buildInstr(AMDGPU::REG_SEQUENCE) 3478 .addDef(RSrcHi) 3479 .addReg(RSrc2) 3480 .addImm(AMDGPU::sub0) 3481 .addReg(RSrc3) 3482 .addImm(AMDGPU::sub1); 3483 3484 Register RSrcLo = BasePtr; 3485 if (!BasePtr) { 3486 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 3487 B.buildInstr(AMDGPU::S_MOV_B64) 3488 .addDef(RSrcLo) 3489 .addImm(0); 3490 } 3491 3492 B.buildInstr(AMDGPU::REG_SEQUENCE) 3493 .addDef(RSrc) 3494 .addReg(RSrcLo) 3495 .addImm(AMDGPU::sub0_sub1) 3496 .addReg(RSrcHi) 3497 .addImm(AMDGPU::sub2_sub3); 3498 3499 return RSrc; 3500 } 3501 3502 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 3503 const SIInstrInfo &TII, Register BasePtr) { 3504 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 3505 3506 // FIXME: Why are half the "default" bits ignored based on the addressing 3507 // mode? 3508 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr); 3509 } 3510 3511 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 3512 const SIInstrInfo &TII, Register BasePtr) { 3513 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 3514 3515 // FIXME: Why are half the "default" bits ignored based on the addressing 3516 // mode? 
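// Added note (assumption, not from the original source): passing -1 here fills
// dword 2 of the descriptor with all ones, presumably acting as an unbounded
// NUM_RECORDS value, while dword 3 still takes only the high half of the
// default data format.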
3517 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3518 }
3519
3520 AMDGPUInstructionSelector::MUBUFAddressData
3521 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3522 MUBUFAddressData Data;
3523 Data.N0 = Src;
3524
3525 Register PtrBase;
3526 int64_t Offset;
3527
3528 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3529 if (isUInt<32>(Offset)) {
3530 Data.N0 = PtrBase;
3531 Data.Offset = Offset;
3532 }
3533
3534 if (MachineInstr *InputAdd
3535 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3536 Data.N2 = InputAdd->getOperand(1).getReg();
3537 Data.N3 = InputAdd->getOperand(2).getReg();
3538
3539 // FIXME: Need to fix extra SGPR->VGPR copies inserted
3540 // FIXME: Don't know that this was defined by operand 0
3541 //
3542 // TODO: Remove this when we have copy folding optimizations after
3543 // RegBankSelect.
3544 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3545 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3546 }
3547
3548 return Data;
3549 }
3550
3551 /// Return whether the addr64 MUBUF mode should be used for the given address.
3552 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
3553 // (ptr_add N2, N3) -> addr64, or
3554 // (ptr_add (ptr_add N2, N3), C1) -> addr64
3555 if (Addr.N2)
3556 return true;
3557
3558 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
3559 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
3560 }
3561
3562 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
3563 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
3564 /// component.
3565 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
3566 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
3567 if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
3568 return;
3569
3570 // Illegal offset, store it in soffset.
3571 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3572 B.buildInstr(AMDGPU::S_MOV_B32)
3573 .addDef(SOffset)
3574 .addImm(ImmOffset);
3575 ImmOffset = 0;
3576 }
3577
3578 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
3579 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
3580 Register &SOffset, int64_t &Offset) const {
3581 // FIXME: Predicates should stop this from reaching here.
3582 // The addr64 bit was removed for Volcanic Islands.
3583 if (!STI.hasAddr64() || STI.useFlatForGlobal())
3584 return false;
3585
3586 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3587 if (!shouldUseAddr64(AddrData))
3588 return false;
3589
3590 Register N0 = AddrData.N0;
3591 Register N2 = AddrData.N2;
3592 Register N3 = AddrData.N3;
3593 Offset = AddrData.Offset;
3594
3595 // Base pointer for the SRD.
3596 Register SRDPtr;
3597
3598 if (N2) {
3599 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3600 assert(N3);
3601 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3602 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
3603 // addr64, and construct the default resource from a 0 address.
3604 VAddr = N0;
3605 } else {
3606 SRDPtr = N3;
3607 VAddr = N2;
3608 }
3609 } else {
3610 // N2 is not divergent.
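// Use N2 (the uniform operand) as the SRD base pointer and N3 as the 64-bit
// address.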
3611 SRDPtr = N2; 3612 VAddr = N3; 3613 } 3614 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 3615 // Use the default null pointer in the resource 3616 VAddr = N0; 3617 } else { 3618 // N0 -> offset, or 3619 // (N0 + C1) -> offset 3620 SRDPtr = N0; 3621 } 3622 3623 MachineIRBuilder B(*Root.getParent()); 3624 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr); 3625 splitIllegalMUBUFOffset(B, SOffset, Offset); 3626 return true; 3627 } 3628 3629 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( 3630 MachineOperand &Root, Register &RSrcReg, Register &SOffset, 3631 int64_t &Offset) const { 3632 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 3633 if (shouldUseAddr64(AddrData)) 3634 return false; 3635 3636 // N0 -> offset, or 3637 // (N0 + C1) -> offset 3638 Register SRDPtr = AddrData.N0; 3639 Offset = AddrData.Offset; 3640 3641 // TODO: Look through extensions for 32-bit soffset. 3642 MachineIRBuilder B(*Root.getParent()); 3643 3644 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); 3645 splitIllegalMUBUFOffset(B, SOffset, Offset); 3646 return true; 3647 } 3648 3649 InstructionSelector::ComplexRendererFns 3650 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { 3651 Register VAddr; 3652 Register RSrcReg; 3653 Register SOffset; 3654 int64_t Offset = 0; 3655 3656 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 3657 return {}; 3658 3659 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 3660 // pattern. 3661 return {{ 3662 [=](MachineInstrBuilder &MIB) { // rsrc 3663 MIB.addReg(RSrcReg); 3664 }, 3665 [=](MachineInstrBuilder &MIB) { // vaddr 3666 MIB.addReg(VAddr); 3667 }, 3668 [=](MachineInstrBuilder &MIB) { // soffset 3669 if (SOffset) 3670 MIB.addReg(SOffset); 3671 else 3672 MIB.addImm(0); 3673 }, 3674 [=](MachineInstrBuilder &MIB) { // offset 3675 MIB.addImm(Offset); 3676 }, 3677 addZeroImm, // glc 3678 addZeroImm, // slc 3679 addZeroImm, // tfe 3680 addZeroImm, // dlc 3681 addZeroImm // swz 3682 }}; 3683 } 3684 3685 InstructionSelector::ComplexRendererFns 3686 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { 3687 Register RSrcReg; 3688 Register SOffset; 3689 int64_t Offset = 0; 3690 3691 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 3692 return {}; 3693 3694 return {{ 3695 [=](MachineInstrBuilder &MIB) { // rsrc 3696 MIB.addReg(RSrcReg); 3697 }, 3698 [=](MachineInstrBuilder &MIB) { // soffset 3699 if (SOffset) 3700 MIB.addReg(SOffset); 3701 else 3702 MIB.addImm(0); 3703 }, 3704 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 3705 addZeroImm, // glc 3706 addZeroImm, // slc 3707 addZeroImm, // tfe 3708 addZeroImm, // dlc 3709 addZeroImm // swz 3710 }}; 3711 } 3712 3713 InstructionSelector::ComplexRendererFns 3714 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const { 3715 Register VAddr; 3716 Register RSrcReg; 3717 Register SOffset; 3718 int64_t Offset = 0; 3719 3720 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 3721 return {}; 3722 3723 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 3724 // pattern. 
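// The atomic variants render only an slc immediate after the offset; the
// glc/tfe/dlc/swz operands used by the non-atomic forms above are not part of
// this complex pattern.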
3725 return {{
3726 [=](MachineInstrBuilder &MIB) { // rsrc
3727 MIB.addReg(RSrcReg);
3728 },
3729 [=](MachineInstrBuilder &MIB) { // vaddr
3730 MIB.addReg(VAddr);
3731 },
3732 [=](MachineInstrBuilder &MIB) { // soffset
3733 if (SOffset)
3734 MIB.addReg(SOffset);
3735 else
3736 MIB.addImm(0);
3737 },
3738 [=](MachineInstrBuilder &MIB) { // offset
3739 MIB.addImm(Offset);
3740 },
3741 addZeroImm // slc
3742 }};
3743 }
3744
3745 InstructionSelector::ComplexRendererFns
3746 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
3747 Register RSrcReg;
3748 Register SOffset;
3749 int64_t Offset = 0;
3750
3751 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3752 return {};
3753
3754 return {{
3755 [=](MachineInstrBuilder &MIB) { // rsrc
3756 MIB.addReg(RSrcReg);
3757 },
3758 [=](MachineInstrBuilder &MIB) { // soffset
3759 if (SOffset)
3760 MIB.addReg(SOffset);
3761 else
3762 MIB.addImm(0);
3763 },
3764 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3765 addZeroImm // slc
3766 }};
3767 }
3768
3769 /// Get an immediate that must be 32 bits and is treated as zero-extended.
3770 static Optional<uint64_t> getConstantZext32Val(Register Reg,
3771 const MachineRegisterInfo &MRI) {
3772 // getConstantVRegVal sign-extends values, so see if that matters.
3773 Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
3774 if (!OffsetVal || !isInt<32>(*OffsetVal))
3775 return None;
3776 return Lo_32(*OffsetVal);
3777 }
3778
3779 InstructionSelector::ComplexRendererFns
3780 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
3781 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3782 if (!OffsetVal)
3783 return {};
3784
3785 Optional<int64_t> EncodedImm =
3786 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
3787 if (!EncodedImm)
3788 return {};
3789
3790 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
3791 }
3792
3793 InstructionSelector::ComplexRendererFns
3794 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
3795 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
3796
3797 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3798 if (!OffsetVal)
3799 return {};
3800
3801 Optional<int64_t> EncodedImm
3802 = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
3803 if (!EncodedImm)
3804 return {};
3805
3806 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
3807 }
3808
3809 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
3810 const MachineInstr &MI,
3811 int OpIdx) const {
3812 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3813 "Expected G_CONSTANT");
3814 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
3815 }
3816
3817 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
3818 const MachineInstr &MI,
3819 int OpIdx) const {
3820 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3821 "Expected G_CONSTANT");
3822 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
3823 }
3824
3825 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
3826 const MachineInstr &MI,
3827 int OpIdx) const {
3828 assert(OpIdx == -1);
3829
3830 const MachineOperand &Op = MI.getOperand(1);
3831 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
3832 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
3833 else {
3834 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
3835 MIB.addImm(Op.getCImm()->getSExtValue());
3836 }
3837 }
3838
3839 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
3840 const MachineInstr &MI,
3841 int OpIdx) const {
3842 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3843 "Expected G_CONSTANT");
3844 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
3845 }
3846
3847 /// This only really exists to satisfy DAG type checking machinery, so it is a
3848 /// no-op here.
3849 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
3850 const MachineInstr &MI,
3851 int OpIdx) const {
3852 MIB.addImm(MI.getOperand(OpIdx).getImm());
3853 }
3854
3855 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
3856 const MachineInstr &MI,
3857 int OpIdx) const {
3858 assert(OpIdx >= 0 && "expected to match an immediate operand");
3859 MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
3860 }
3861
3862 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
3863 const MachineInstr &MI,
3864 int OpIdx) const {
3865 assert(OpIdx >= 0 && "expected to match an immediate operand");
3866 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
3867 }
3868
3869 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
3870 const MachineInstr &MI,
3871 int OpIdx) const {
3872 assert(OpIdx >= 0 && "expected to match an immediate operand");
3873 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
3874 }
3875
3876 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
3877 const MachineInstr &MI,
3878 int OpIdx) const {
3879 assert(OpIdx >= 0 && "expected to match an immediate operand");
3880 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
3881 }
3882
3883 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
3884 const MachineInstr &MI,
3885 int OpIdx) const {
3886 MIB.addFrameIndex((MI.getOperand(1).getIndex()));
3887 }
3888
3889 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
3890 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
3891 }
3892
3893 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
3894 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
3895 }
3896
3897 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
3898 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
3899 }
3900
3901 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
3902 return TII.isInlineConstant(Imm);
3903 }
3904