//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

static cl::opt<bool> AllowRiskySelect(
  "amdgpu-global-isel-risky-select",
  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
  cl::init(false),
  cl::ReallyHidden);

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                        CodeGenCoverage &CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.RemoveOperand(1); // Remove intrinsic ID.
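  // Add an implicit use of exec to the resulting copy-like pseudo.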
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Optional<ValueAndVReg> ConstVal =
          getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        unsigned AndOpc =
            TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
        BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
            .addImm(1)
            .addReg(SrcReg);
        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
            .addImm(0)
            .addReg(MaskedReg);
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);
  if (DefTy == LLT::scalar(1)) {
    if (!AllowRiskySelect) {
      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
  }

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
            .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
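
  // 64-bit case: split each operand into 32-bit halves, add the low halves,
  // then add the high halves with the carry produced by the low add.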
  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
      IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
    .addReg(AMDGPU::SCC);

  if (!MRI->getRegClassOrNull(Dst1Reg))
    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
  MachineInstr &MI) const {
  if (selectImpl(MI, *CoverageInfo))
    return true;

  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != V2S16)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI->getType(Src0) != S32)
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  auto ConstSrc1 =
      getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;

      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
        .addImm(Lo16 | (Hi16 << 16));
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.RemoveOperand(2);
    return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
           RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector_trunc $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
    // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
    auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
      .addReg(ShiftSrc0)
      .addImm(16);

    MI.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
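  // (i.e. inserts covering more than four 32-bit channels)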
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
  const TargetRegisterClass *Src1RC =
    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
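// When neither operand can be folded to an immediate, the lane select is
// routed through m0 below so only one true SGPR operand remains.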
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  Optional<ValueAndVReg> ConstSelect =
    getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    Optional<ValueAndVReg> ConstVal =
      getConstantVRegValWithLookThrough(Val, *MRI, true, true);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
    return selectIntrinsicIcmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
            I.getOperand(0).getReg())
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
                           .add(I.getOperand(2))
                           .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
                               *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;

  if (Size != STI.getWavefrontSize())
    return false;

  Optional<ValueAndVReg> Arg =
      getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);

  if (Arg.hasValue()) {
    const int64_t Value = Arg.getValue().Value.getSExtValue();
    if (Value == 0) {
      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else if (Value == -1) { // all ones
      Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
    } else
      return false;
  } else {
    Register SrcReg = I.getOperand(2).getReg();
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
  }

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto RelocSymbol = cast<GlobalVariable>(
    M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  MachineBasicBlock *BB = I.getParent();
  BuildMI(*BB, &I, I.getDebugLoc(),
          TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
    .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();

  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
    AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);

  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
    const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    MIB.addImm(MFI->getLDSSize());
  } else {
    Module *M = MF->getFunction().getParent();
    const GlobalValue *GV
      = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
  }

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
      !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions
  if (Depth != 0 ||
      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
      .addImm(0);
    I.eraseFromParent();
    return true;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  // Get the return address reg and mark it as an implicit live-in
  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                             AMDGPU::SReg_64RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(LiveIn);
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
  MachineBasicBlock *BB = MI.getParent();
  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
    .add(MI.getOperand(1));

  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
  return true;
}

bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
  MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease)
    report_fatal_error("ds_ordered_count: wave_done requires wave_release");

  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      report_fatal_error(
        "ds_ordered_count: dword count must be between 1 and 4");
    }
  }

  if (IndexOperand)
    report_fatal_error("ds_ordered_count: bad index operand");

  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);

  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                     (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  unsigned Offset = Offset0 | (Offset1 << 8);
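  // Packed offset field as built above: bits [7:2] hold the ordered-count
  // index, bit 8 wave_release, bit 9 wave_done, bits [11:10] the shader type,
  // bit 12 the add/swap selector, and bits [15:14] (GFX10+) the dword count
  // minus one.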
  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !STI.hasGWSSemaReleaseAll())
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  assert(OffsetDef);

  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.

    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset) =
        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16);

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();

    if (STI.needsAlignedVGPRs()) {
      // Add implicit aligned super-reg to force alignment on the data operand.
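      // Wrap the single dword in an even-aligned 64-bit REG_SEQUENCE; only
      // sub0 carries the real value, sub1 stays undef.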
      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
      Register NewVR =
          MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
      BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
          .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
          .addImm(AMDGPU::sub0)
          .addReg(Undef)
          .addImm(AMDGPU::sub1);
      MIB.addReg(NewVR, 0, AMDGPU::sub0);
      MIB.addReg(NewVR, RegState::Implicit);
    } else {
      MIB.addReg(VSrc);
    }

    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
                 .addImm(Offset)
                 .addImm(IsGDS ? -1 : 0)
                 .cloneMemRefs(MI);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
  if (TM.getOptLevel() > CodeGenOpt::None) {
    unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
    if (WGSize <= STI.getWavefrontSize()) {
      MachineBasicBlock *MBB = MI.getParent();
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
      MI.eraseFromParent();
      return true;
    }
  }
  return selectImpl(MI, *CoverageInfo);
}

static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
  TexFailCtrl &= ~(uint64_t)0x2;

  return TexFailCtrl == 0;
}

bool AMDGPUInstructionSelector::selectImageIntrinsic(
  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
      AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
      AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);

  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = false;

  bool Unorm;
  if (!BaseOpcode->Sampler)
    Unorm = true;
  else
    Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
                    TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients if subtarget doesn't support G16
  if (IsA16 && !STI.hasG16() && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

    // One memoperand is mandatory, except for getresinfo.
    // FIXME: Check this in verifier.
    if (!MI.memoperands_empty()) {
      const MachineMemOperand *MMO = *MI.memoperands_begin();

      // Infer d16 from the memory size, as the register type will be mangled by
      // unpacked subtargets, or by TFE.
      IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
    }

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      if (IsD16 && !STI.hasUnpackedD16VMem())
        NumVDataDwords = (DMaskLanes + 1) / 2;
    }
  }

  // Optimize _L to _LZ when _L is zero
  if (LZMappingInfo) {
    // The legalizer replaced the register with an immediate 0 if we need to
    // change the opcode.
    const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (MIPMappingInfo) {
    const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
    if (Lod.isImm()) {
      assert(Lod.getImm() == 0);
      IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
    }
  }

  // Set G16 opcode
  if (IsG16 && !IsA16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
  if (BaseOpcode->Atomic)
    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
  if (CPol & ~AMDGPU::CPol::ALL)
    return false;

  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register
  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  if (IsTexFail)
    ++NumVDataDwords;

  int Opcode = -1;
  if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
AMDGPU::sub0_sub1 : AMDGPU::sub0; 1666 1667 MIB.addDef(TmpReg); 1668 if (!MRI->use_empty(VDataOut)) { 1669 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut) 1670 .addReg(TmpReg, RegState::Kill, SubReg); 1671 } 1672 1673 } else { 1674 MIB.addDef(VDataOut); // vdata output 1675 } 1676 } 1677 1678 if (VDataIn) 1679 MIB.addReg(VDataIn); // vdata input 1680 1681 for (int I = 0; I != NumVAddrRegs; ++I) { 1682 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I); 1683 if (SrcOp.isReg()) { 1684 assert(SrcOp.getReg() != 0); 1685 MIB.addReg(SrcOp.getReg()); 1686 } 1687 } 1688 1689 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg()); 1690 if (BaseOpcode->Sampler) 1691 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg()); 1692 1693 MIB.addImm(DMask); // dmask 1694 1695 if (IsGFX10Plus) 1696 MIB.addImm(DimInfo->Encoding); 1697 MIB.addImm(Unorm); 1698 1699 MIB.addImm(CPol); 1700 MIB.addImm(IsA16 && // a16 or r128 1701 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0); 1702 if (IsGFX10Plus) 1703 MIB.addImm(IsA16 ? -1 : 0); 1704 1705 MIB.addImm(TFE); // tfe 1706 MIB.addImm(LWE); // lwe 1707 if (!IsGFX10Plus) 1708 MIB.addImm(DimInfo->DA ? -1 : 0); 1709 if (BaseOpcode->HasD16) 1710 MIB.addImm(IsD16 ? -1 : 0); 1711 1712 if (IsTexFail) { 1713 // An image load instruction with TFE/LWE only conditionally writes to its 1714 // result registers. Initialize them to zero so that we always get well 1715 // defined result values. 1716 assert(VDataOut && !VDataIn); 1717 Register Tied = MRI->cloneVirtualRegister(VDataOut); 1718 Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1719 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero) 1720 .addImm(0); 1721 auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4); 1722 if (STI.usePRTStrictNull()) { 1723 // With enable-prt-strict-null enabled, initialize all result registers to 1724 // zero. 1725 auto RegSeq = 1726 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied); 1727 for (auto Sub : Parts) 1728 RegSeq.addReg(Zero).addImm(Sub); 1729 } else { 1730 // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE 1731 // result register. 
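      // In this mode only the trailing dword (the TFE/LWE status) gets the
      // zero; the data dwords are filled with IMPLICIT_DEF below, so their
      // contents stay undefined when the instruction does not write them.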
1732 Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1733 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); 1734 auto RegSeq = 1735 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied); 1736 for (auto Sub : Parts.drop_back(1)) 1737 RegSeq.addReg(Undef).addImm(Sub); 1738 RegSeq.addReg(Zero).addImm(Parts.back()); 1739 } 1740 MIB.addReg(Tied, RegState::Implicit); 1741 MIB->tieOperands(0, MIB->getNumOperands() - 1); 1742 } 1743 1744 MI.eraseFromParent(); 1745 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1746 } 1747 1748 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( 1749 MachineInstr &I) const { 1750 unsigned IntrinsicID = I.getIntrinsicID(); 1751 switch (IntrinsicID) { 1752 case Intrinsic::amdgcn_end_cf: 1753 return selectEndCfIntrinsic(I); 1754 case Intrinsic::amdgcn_ds_ordered_add: 1755 case Intrinsic::amdgcn_ds_ordered_swap: 1756 return selectDSOrderedIntrinsic(I, IntrinsicID); 1757 case Intrinsic::amdgcn_ds_gws_init: 1758 case Intrinsic::amdgcn_ds_gws_barrier: 1759 case Intrinsic::amdgcn_ds_gws_sema_v: 1760 case Intrinsic::amdgcn_ds_gws_sema_br: 1761 case Intrinsic::amdgcn_ds_gws_sema_p: 1762 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1763 return selectDSGWSIntrinsic(I, IntrinsicID); 1764 case Intrinsic::amdgcn_ds_append: 1765 return selectDSAppendConsume(I, true); 1766 case Intrinsic::amdgcn_ds_consume: 1767 return selectDSAppendConsume(I, false); 1768 case Intrinsic::amdgcn_s_barrier: 1769 return selectSBarrier(I); 1770 case Intrinsic::amdgcn_global_atomic_fadd: 1771 return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3)); 1772 default: { 1773 return selectImpl(I, *CoverageInfo); 1774 } 1775 } 1776 } 1777 1778 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { 1779 if (selectImpl(I, *CoverageInfo)) 1780 return true; 1781 1782 MachineBasicBlock *BB = I.getParent(); 1783 const DebugLoc &DL = I.getDebugLoc(); 1784 1785 Register DstReg = I.getOperand(0).getReg(); 1786 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 1787 assert(Size <= 32 || Size == 64); 1788 const MachineOperand &CCOp = I.getOperand(1); 1789 Register CCReg = CCOp.getReg(); 1790 if (!isVCC(CCReg, *MRI)) { 1791 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : 1792 AMDGPU::S_CSELECT_B32; 1793 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1794 .addReg(CCReg); 1795 1796 // The generic constrainSelectedInstRegOperands doesn't work for the scc register 1797 // bank, because it does not cover the register class that we used to represent 1798 // for it. So we need to manually set the register class here. 1799 if (!MRI->getRegClassOrNull(CCReg)) 1800 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1801 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1802 .add(I.getOperand(2)) 1803 .add(I.getOperand(3)); 1804 1805 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1806 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1807 I.eraseFromParent(); 1808 return Ret; 1809 } 1810 1811 // Wide VGPR select should have been split in RegBankSelect. 
1812 if (Size > 32) 1813 return false; 1814 1815 MachineInstr *Select = 1816 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1817 .addImm(0) 1818 .add(I.getOperand(3)) 1819 .addImm(0) 1820 .add(I.getOperand(2)) 1821 .add(I.getOperand(1)); 1822 1823 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1824 I.eraseFromParent(); 1825 return Ret; 1826 } 1827 1828 static int sizeToSubRegIndex(unsigned Size) { 1829 switch (Size) { 1830 case 32: 1831 return AMDGPU::sub0; 1832 case 64: 1833 return AMDGPU::sub0_sub1; 1834 case 96: 1835 return AMDGPU::sub0_sub1_sub2; 1836 case 128: 1837 return AMDGPU::sub0_sub1_sub2_sub3; 1838 case 256: 1839 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1840 default: 1841 if (Size < 32) 1842 return AMDGPU::sub0; 1843 if (Size > 256) 1844 return -1; 1845 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1846 } 1847 } 1848 1849 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1850 Register DstReg = I.getOperand(0).getReg(); 1851 Register SrcReg = I.getOperand(1).getReg(); 1852 const LLT DstTy = MRI->getType(DstReg); 1853 const LLT SrcTy = MRI->getType(SrcReg); 1854 const LLT S1 = LLT::scalar(1); 1855 1856 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1857 const RegisterBank *DstRB; 1858 if (DstTy == S1) { 1859 // This is a special case. We don't treat s1 for legalization artifacts as 1860 // vcc booleans. 1861 DstRB = SrcRB; 1862 } else { 1863 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1864 if (SrcRB != DstRB) 1865 return false; 1866 } 1867 1868 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 1869 1870 unsigned DstSize = DstTy.getSizeInBits(); 1871 unsigned SrcSize = SrcTy.getSizeInBits(); 1872 1873 const TargetRegisterClass *SrcRC 1874 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1875 const TargetRegisterClass *DstRC 1876 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1877 if (!SrcRC || !DstRC) 1878 return false; 1879 1880 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1881 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1882 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1883 return false; 1884 } 1885 1886 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { 1887 MachineBasicBlock *MBB = I.getParent(); 1888 const DebugLoc &DL = I.getDebugLoc(); 1889 1890 Register LoReg = MRI->createVirtualRegister(DstRC); 1891 Register HiReg = MRI->createVirtualRegister(DstRC); 1892 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 1893 .addReg(SrcReg, 0, AMDGPU::sub0); 1894 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 1895 .addReg(SrcReg, 0, AMDGPU::sub1); 1896 1897 if (IsVALU && STI.hasSDWA()) { 1898 // Write the low 16-bits of the high element into the high 16-bits of the 1899 // low element. 
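      // The SDWA mov below selects WORD_0 of HiReg as its source, writes only
      // WORD_1 of the destination, and preserves the untouched half through
      // the tied implicit use of LoReg.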
1900 MachineInstr *MovSDWA = 1901 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1902 .addImm(0) // $src0_modifiers 1903 .addReg(HiReg) // $src0 1904 .addImm(0) // $clamp 1905 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1906 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1907 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1908 .addReg(LoReg, RegState::Implicit); 1909 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1910 } else { 1911 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1912 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1913 Register ImmReg = MRI->createVirtualRegister(DstRC); 1914 if (IsVALU) { 1915 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1916 .addImm(16) 1917 .addReg(HiReg); 1918 } else { 1919 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1920 .addReg(HiReg) 1921 .addImm(16); 1922 } 1923 1924 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1925 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1926 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1927 1928 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1929 .addImm(0xffff); 1930 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1931 .addReg(LoReg) 1932 .addReg(ImmReg); 1933 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1934 .addReg(TmpReg0) 1935 .addReg(TmpReg1); 1936 } 1937 1938 I.eraseFromParent(); 1939 return true; 1940 } 1941 1942 if (!DstTy.isScalar()) 1943 return false; 1944 1945 if (SrcSize > 32) { 1946 int SubRegIdx = sizeToSubRegIndex(DstSize); 1947 if (SubRegIdx == -1) 1948 return false; 1949 1950 // Deal with weird cases where the class only partially supports the subreg 1951 // index. 1952 const TargetRegisterClass *SrcWithSubRC 1953 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1954 if (!SrcWithSubRC) 1955 return false; 1956 1957 if (SrcWithSubRC != SrcRC) { 1958 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1959 return false; 1960 } 1961 1962 I.getOperand(1).setSubReg(SubRegIdx); 1963 } 1964 1965 I.setDesc(TII.get(TargetOpcode::COPY)); 1966 return true; 1967 } 1968 1969 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1970 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1971 Mask = maskTrailingOnes<unsigned>(Size); 1972 int SignedMask = static_cast<int>(Mask); 1973 return SignedMask >= -16 && SignedMask <= 64; 1974 } 1975 1976 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1977 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1978 Register Reg, const MachineRegisterInfo &MRI, 1979 const TargetRegisterInfo &TRI) const { 1980 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1981 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1982 return RB; 1983 1984 // Ignore the type, since we don't use vcc in artifacts. 
1985 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1986 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1987 return nullptr; 1988 } 1989 1990 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1991 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1992 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1993 const DebugLoc &DL = I.getDebugLoc(); 1994 MachineBasicBlock &MBB = *I.getParent(); 1995 const Register DstReg = I.getOperand(0).getReg(); 1996 const Register SrcReg = I.getOperand(1).getReg(); 1997 1998 const LLT DstTy = MRI->getType(DstReg); 1999 const LLT SrcTy = MRI->getType(SrcReg); 2000 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 2001 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 2002 const unsigned DstSize = DstTy.getSizeInBits(); 2003 if (!DstTy.isScalar()) 2004 return false; 2005 2006 // Artifact casts should never use vcc. 2007 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 2008 2009 // FIXME: This should probably be illegal and split earlier. 2010 if (I.getOpcode() == AMDGPU::G_ANYEXT) { 2011 if (DstSize <= 32) 2012 return selectCOPY(I); 2013 2014 const TargetRegisterClass *SrcRC = 2015 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI); 2016 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 2017 const TargetRegisterClass *DstRC = 2018 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 2019 2020 Register UndefReg = MRI->createVirtualRegister(SrcRC); 2021 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 2022 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2023 .addReg(SrcReg) 2024 .addImm(AMDGPU::sub0) 2025 .addReg(UndefReg) 2026 .addImm(AMDGPU::sub1); 2027 I.eraseFromParent(); 2028 2029 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) && 2030 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI); 2031 } 2032 2033 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 2034 // 64-bit should have been split up in RegBankSelect 2035 2036 // Try to use an and with a mask if it will save code size. 2037 unsigned Mask; 2038 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2039 MachineInstr *ExtI = 2040 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 2041 .addImm(Mask) 2042 .addReg(SrcReg); 2043 I.eraseFromParent(); 2044 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2045 } 2046 2047 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; 2048 MachineInstr *ExtI = 2049 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 2050 .addReg(SrcReg) 2051 .addImm(0) // Offset 2052 .addImm(SrcSize); // Width 2053 I.eraseFromParent(); 2054 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2055 } 2056 2057 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 2058 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 2059 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 2060 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 2061 return false; 2062 2063 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 2064 const unsigned SextOpc = SrcSize == 8 ? 2065 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 2066 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 2067 .addReg(SrcReg); 2068 I.eraseFromParent(); 2069 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2070 } 2071 2072 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 2073 const unsigned BFE32 = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 2074 2075 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 2076 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 2077 // We need a 64-bit register source, but the high bits don't matter. 2078 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 2079 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2080 unsigned SubReg = InReg ? AMDGPU::sub0 : 0; 2081 2082 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 2083 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 2084 .addReg(SrcReg, 0, SubReg) 2085 .addImm(AMDGPU::sub0) 2086 .addReg(UndefReg) 2087 .addImm(AMDGPU::sub1); 2088 2089 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 2090 .addReg(ExtReg) 2091 .addImm(SrcSize << 16); 2092 2093 I.eraseFromParent(); 2094 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 2095 } 2096 2097 unsigned Mask; 2098 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2099 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 2100 .addReg(SrcReg) 2101 .addImm(Mask); 2102 } else { 2103 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 2104 .addReg(SrcReg) 2105 .addImm(SrcSize << 16); 2106 } 2107 2108 I.eraseFromParent(); 2109 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2110 } 2111 2112 return false; 2113 } 2114 2115 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 2116 MachineBasicBlock *BB = I.getParent(); 2117 MachineOperand &ImmOp = I.getOperand(1); 2118 Register DstReg = I.getOperand(0).getReg(); 2119 unsigned Size = MRI->getType(DstReg).getSizeInBits(); 2120 2121 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 2122 if (ImmOp.isFPImm()) { 2123 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 2124 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 2125 } else if (ImmOp.isCImm()) { 2126 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); 2127 } else { 2128 llvm_unreachable("Not supported by g_constants"); 2129 } 2130 2131 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2132 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID; 2133 2134 unsigned Opcode; 2135 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 2136 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 2137 } else { 2138 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2139 2140 // We should never produce s1 values on banks other than VCC. If the user of 2141 // this already constrained the register, we may incorrectly think it's VCC 2142 // if it wasn't originally. 2143 if (Size == 1) 2144 return false; 2145 } 2146 2147 if (Size != 64) { 2148 I.setDesc(TII.get(Opcode)); 2149 I.addImplicitDefUseOperands(*MF); 2150 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2151 } 2152 2153 const DebugLoc &DL = I.getDebugLoc(); 2154 2155 APInt Imm(Size, I.getOperand(1).getImm()); 2156 2157 MachineInstr *ResInst; 2158 if (IsSgpr && TII.isInlineConstant(Imm)) { 2159 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 2160 .addImm(I.getOperand(1).getImm()); 2161 } else { 2162 const TargetRegisterClass *RC = IsSgpr ? 
2163 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 2164 Register LoReg = MRI->createVirtualRegister(RC); 2165 Register HiReg = MRI->createVirtualRegister(RC); 2166 2167 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 2168 .addImm(Imm.trunc(32).getZExtValue()); 2169 2170 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 2171 .addImm(Imm.ashr(32).getZExtValue()); 2172 2173 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2174 .addReg(LoReg) 2175 .addImm(AMDGPU::sub0) 2176 .addReg(HiReg) 2177 .addImm(AMDGPU::sub1); 2178 } 2179 2180 // We can't call constrainSelectedInstRegOperands here, because it doesn't 2181 // work for target independent opcodes 2182 I.eraseFromParent(); 2183 const TargetRegisterClass *DstRC = 2184 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 2185 if (!DstRC) 2186 return true; 2187 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 2188 } 2189 2190 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2191 // Only manually handle the f64 SGPR case. 2192 // 2193 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2194 // the bit ops theoretically have a second result due to the implicit def of 2195 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2196 // that is easy by disabling the check. The result works, but uses a 2197 // nonsensical sreg32orlds_and_sreg_1 regclass. 2198 // 2199 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2200 // the variadic REG_SEQUENCE operands. 2201 2202 Register Dst = MI.getOperand(0).getReg(); 2203 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2204 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2205 MRI->getType(Dst) != LLT::scalar(64)) 2206 return false; 2207 2208 Register Src = MI.getOperand(1).getReg(); 2209 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 2210 if (Fabs) 2211 Src = Fabs->getOperand(1).getReg(); 2212 2213 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2214 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2215 return false; 2216 2217 MachineBasicBlock *BB = MI.getParent(); 2218 const DebugLoc &DL = MI.getDebugLoc(); 2219 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2220 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2221 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2222 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2223 2224 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2225 .addReg(Src, 0, AMDGPU::sub0); 2226 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2227 .addReg(Src, 0, AMDGPU::sub1); 2228 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2229 .addImm(0x80000000); 2230 2231 // Set or toggle sign bit. 2232 unsigned Opc = Fabs ? 
AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}

// FIXME: This is a workaround for the same tablegen problems as G_FNEG
bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  Register Src = MI.getOperand(1).getReg();
  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(Src, 0, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(Src, 0, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
    .addImm(0x7fffffff);

  // Clear sign bit.
  // TODO: Should this use S_BITSET0_*?
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);

  MI.eraseFromParent();
  return true;
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
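      // Operand 2 of the G_PTR_ADD is the offset; when its def is a
      // G_CONSTANT, the value is folded into GEPInfo.Imm rather than being
      // recorded as an SGPR/VGPR part.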
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}

bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
  return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
}

bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    MachineBasicBlock *BB = I.getParent();

    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
  }
}

bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
  MachineInstr &I) const {
  if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
    const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
    unsigned AS = PtrTy.getAddressSpace();
    if (AS == AMDGPUAS::GLOBAL_ADDRESS)
      return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
  }

  initM0(I);
  return selectImpl(I, *CoverageInfo);
}

// TODO: No rtn optimization.
bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
  MachineInstr &MI) const {
  Register PtrReg = MI.getOperand(1).getReg();
  const LLT PtrTy = MRI->getType(PtrReg);
  if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
      STI.useFlatForGlobal())
    return selectImpl(MI, *CoverageInfo);

  Register DstReg = MI.getOperand(0).getReg();
  const LLT Ty = MRI->getType(DstReg);
  const bool Is64 = Ty.getSizeInBits() == 64;
  const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  Register TmpReg = MRI->createVirtualRegister(
    Is64 ?
&AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); 2400 2401 const DebugLoc &DL = MI.getDebugLoc(); 2402 MachineBasicBlock *BB = MI.getParent(); 2403 2404 Register VAddr, RSrcReg, SOffset; 2405 int64_t Offset = 0; 2406 2407 unsigned Opcode; 2408 if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) { 2409 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN : 2410 AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN; 2411 } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr, 2412 RSrcReg, SOffset, Offset)) { 2413 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : 2414 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; 2415 } else 2416 return selectImpl(MI, *CoverageInfo); 2417 2418 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) 2419 .addReg(MI.getOperand(2).getReg()); 2420 2421 if (VAddr) 2422 MIB.addReg(VAddr); 2423 2424 MIB.addReg(RSrcReg); 2425 if (SOffset) 2426 MIB.addReg(SOffset); 2427 else 2428 MIB.addImm(0); 2429 2430 MIB.addImm(Offset); 2431 MIB.addImm(AMDGPU::CPol::GLC); 2432 MIB.cloneMemRefs(MI); 2433 2434 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) 2435 .addReg(TmpReg, RegState::Kill, SubReg); 2436 2437 MI.eraseFromParent(); 2438 2439 MRI->setRegClass( 2440 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); 2441 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2442 } 2443 2444 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 2445 MachineBasicBlock *BB = I.getParent(); 2446 MachineOperand &CondOp = I.getOperand(0); 2447 Register CondReg = CondOp.getReg(); 2448 const DebugLoc &DL = I.getDebugLoc(); 2449 2450 unsigned BrOpcode; 2451 Register CondPhysReg; 2452 const TargetRegisterClass *ConstrainRC; 2453 2454 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2455 // whether the branch is uniform when selecting the instruction. In 2456 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 2457 // RegBankSelect knows what it's doing if the branch condition is scc, even 2458 // though it currently does not. 2459 if (!isVCC(CondReg, *MRI)) { 2460 if (MRI->getType(CondReg) != LLT::scalar(32)) 2461 return false; 2462 2463 CondPhysReg = AMDGPU::SCC; 2464 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 2465 ConstrainRC = &AMDGPU::SReg_32RegClass; 2466 } else { 2467 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 2468 // We sort of know that a VCC producer based on the register bank, that ands 2469 // inactive lanes with 0. What if there was a logical operation with vcc 2470 // producers in different blocks/with different exec masks? 2471 // FIXME: Should scc->vcc copies and with exec? 2472 CondPhysReg = TRI.getVCC(); 2473 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 2474 ConstrainRC = TRI.getBoolRC(); 2475 } 2476 2477 if (!MRI->getRegClassOrNull(CondReg)) 2478 MRI->setRegClass(CondReg, ConstrainRC); 2479 2480 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 2481 .addReg(CondReg); 2482 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 2483 .addMBB(I.getOperand(1).getMBB()); 2484 2485 I.eraseFromParent(); 2486 return true; 2487 } 2488 2489 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE( 2490 MachineInstr &I) const { 2491 Register DstReg = I.getOperand(0).getReg(); 2492 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2493 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2494 I.setDesc(TII.get(IsVGPR ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 2495 if (IsVGPR) 2496 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 2497 2498 return RBI.constrainGenericRegister( 2499 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 2500 } 2501 2502 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 2503 Register DstReg = I.getOperand(0).getReg(); 2504 Register SrcReg = I.getOperand(1).getReg(); 2505 Register MaskReg = I.getOperand(2).getReg(); 2506 LLT Ty = MRI->getType(DstReg); 2507 LLT MaskTy = MRI->getType(MaskReg); 2508 2509 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2510 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2511 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 2512 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2513 if (DstRB != SrcRB) // Should only happen for hand written MIR. 2514 return false; 2515 2516 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2517 const TargetRegisterClass &RegRC 2518 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2519 2520 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 2521 *MRI); 2522 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 2523 *MRI); 2524 const TargetRegisterClass *MaskRC = 2525 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); 2526 2527 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2528 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2529 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 2530 return false; 2531 2532 MachineBasicBlock *BB = I.getParent(); 2533 const DebugLoc &DL = I.getDebugLoc(); 2534 if (Ty.getSizeInBits() == 32) { 2535 assert(MaskTy.getSizeInBits() == 32 && 2536 "ptrmask should have been narrowed during legalize"); 2537 2538 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 2539 .addReg(SrcReg) 2540 .addReg(MaskReg); 2541 I.eraseFromParent(); 2542 return true; 2543 } 2544 2545 Register HiReg = MRI->createVirtualRegister(&RegRC); 2546 Register LoReg = MRI->createVirtualRegister(&RegRC); 2547 2548 // Extract the subregisters from the source pointer. 2549 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 2550 .addReg(SrcReg, 0, AMDGPU::sub0); 2551 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 2552 .addReg(SrcReg, 0, AMDGPU::sub1); 2553 2554 Register MaskedLo, MaskedHi; 2555 2556 // Try to avoid emitting a bit operation when we only need to touch half of 2557 // the 64-bit pointer. 2558 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); 2559 2560 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 2561 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 2562 if ((MaskOnes & MaskLo32) == MaskLo32) { 2563 // If all the bits in the low half are 1, we only need a copy for it. 2564 MaskedLo = LoReg; 2565 } else { 2566 // Extract the mask subregister and apply the and. 2567 Register MaskLo = MRI->createVirtualRegister(&RegRC); 2568 MaskedLo = MRI->createVirtualRegister(&RegRC); 2569 2570 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 2571 .addReg(MaskReg, 0, AMDGPU::sub0); 2572 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 2573 .addReg(LoReg) 2574 .addReg(MaskLo); 2575 } 2576 2577 if ((MaskOnes & MaskHi32) == MaskHi32) { 2578 // If all the bits in the high half are 1, we only need a copy for it. 
2579 MaskedHi = HiReg; 2580 } else { 2581 Register MaskHi = MRI->createVirtualRegister(&RegRC); 2582 MaskedHi = MRI->createVirtualRegister(&RegRC); 2583 2584 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 2585 .addReg(MaskReg, 0, AMDGPU::sub1); 2586 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 2587 .addReg(HiReg) 2588 .addReg(MaskHi); 2589 } 2590 2591 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2592 .addReg(MaskedLo) 2593 .addImm(AMDGPU::sub0) 2594 .addReg(MaskedHi) 2595 .addImm(AMDGPU::sub1); 2596 I.eraseFromParent(); 2597 return true; 2598 } 2599 2600 /// Return the register to use for the index value, and the subregister to use 2601 /// for the indirectly accessed register. 2602 static std::pair<Register, unsigned> 2603 computeIndirectRegIndex(MachineRegisterInfo &MRI, 2604 const SIRegisterInfo &TRI, 2605 const TargetRegisterClass *SuperRC, 2606 Register IdxReg, 2607 unsigned EltSize) { 2608 Register IdxBaseReg; 2609 int Offset; 2610 2611 std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 2612 if (IdxBaseReg == AMDGPU::NoRegister) { 2613 // This will happen if the index is a known constant. This should ordinarily 2614 // be legalized out, but handle it as a register just in case. 2615 assert(Offset == 0); 2616 IdxBaseReg = IdxReg; 2617 } 2618 2619 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 2620 2621 // Skip out of bounds offsets, or else we would end up using an undefined 2622 // register. 2623 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 2624 return std::make_pair(IdxReg, SubRegs[0]); 2625 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 2626 } 2627 2628 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 2629 MachineInstr &MI) const { 2630 Register DstReg = MI.getOperand(0).getReg(); 2631 Register SrcReg = MI.getOperand(1).getReg(); 2632 Register IdxReg = MI.getOperand(2).getReg(); 2633 2634 LLT DstTy = MRI->getType(DstReg); 2635 LLT SrcTy = MRI->getType(SrcReg); 2636 2637 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2638 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2639 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2640 2641 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2642 // into a waterfall loop. 2643 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2644 return false; 2645 2646 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 2647 *MRI); 2648 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 2649 *MRI); 2650 if (!SrcRC || !DstRC) 2651 return false; 2652 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2653 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2654 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2655 return false; 2656 2657 MachineBasicBlock *BB = MI.getParent(); 2658 const DebugLoc &DL = MI.getDebugLoc(); 2659 const bool Is64 = DstTy.getSizeInBits() == 64; 2660 2661 unsigned SubReg; 2662 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 2663 DstTy.getSizeInBits() / 8); 2664 2665 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 2666 if (DstTy.getSizeInBits() != 32 && !Is64) 2667 return false; 2668 2669 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2670 .addReg(IdxReg); 2671 2672 unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 2673 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 2674 .addReg(SrcReg, 0, SubReg) 2675 .addReg(SrcReg, RegState::Implicit); 2676 MI.eraseFromParent(); 2677 return true; 2678 } 2679 2680 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 2681 return false; 2682 2683 if (!STI.useVGPRIndexMode()) { 2684 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2685 .addReg(IdxReg); 2686 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 2687 .addReg(SrcReg, 0, SubReg) 2688 .addReg(SrcReg, RegState::Implicit); 2689 MI.eraseFromParent(); 2690 return true; 2691 } 2692 2693 const MCInstrDesc &GPRIDXDesc = 2694 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true); 2695 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 2696 .addReg(SrcReg) 2697 .addReg(IdxReg) 2698 .addImm(SubReg); 2699 2700 MI.eraseFromParent(); 2701 return true; 2702 } 2703 2704 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 2705 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 2706 MachineInstr &MI) const { 2707 Register DstReg = MI.getOperand(0).getReg(); 2708 Register VecReg = MI.getOperand(1).getReg(); 2709 Register ValReg = MI.getOperand(2).getReg(); 2710 Register IdxReg = MI.getOperand(3).getReg(); 2711 2712 LLT VecTy = MRI->getType(DstReg); 2713 LLT ValTy = MRI->getType(ValReg); 2714 unsigned VecSize = VecTy.getSizeInBits(); 2715 unsigned ValSize = ValTy.getSizeInBits(); 2716 2717 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 2718 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 2719 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2720 2721 assert(VecTy.getElementType() == ValTy); 2722 2723 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2724 // into a waterfall loop. 
2725 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2726 return false; 2727 2728 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 2729 *MRI); 2730 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 2731 *MRI); 2732 2733 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 2734 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 2735 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 2736 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2737 return false; 2738 2739 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 2740 return false; 2741 2742 unsigned SubReg; 2743 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 2744 ValSize / 8); 2745 2746 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 2747 STI.useVGPRIndexMode(); 2748 2749 MachineBasicBlock *BB = MI.getParent(); 2750 const DebugLoc &DL = MI.getDebugLoc(); 2751 2752 if (!IndexMode) { 2753 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2754 .addReg(IdxReg); 2755 2756 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo( 2757 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID); 2758 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 2759 .addReg(VecReg) 2760 .addReg(ValReg) 2761 .addImm(SubReg); 2762 MI.eraseFromParent(); 2763 return true; 2764 } 2765 2766 const MCInstrDesc &GPRIDXDesc = 2767 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 2768 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 2769 .addReg(VecReg) 2770 .addReg(ValReg) 2771 .addReg(IdxReg) 2772 .addImm(SubReg); 2773 2774 MI.eraseFromParent(); 2775 return true; 2776 } 2777 2778 static bool isZeroOrUndef(int X) { 2779 return X == 0 || X == -1; 2780 } 2781 2782 static bool isOneOrUndef(int X) { 2783 return X == 1 || X == -1; 2784 } 2785 2786 static bool isZeroOrOneOrUndef(int X) { 2787 return X == 0 || X == 1 || X == -1; 2788 } 2789 2790 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single 2791 // 32-bit register. 2792 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, 2793 ArrayRef<int> Mask) { 2794 NewMask[0] = Mask[0]; 2795 NewMask[1] = Mask[1]; 2796 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) 2797 return Src0; 2798 2799 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); 2800 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); 2801 2802 // Shift the mask inputs to be 0/1; 2803 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; 2804 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; 2805 return Src1; 2806 } 2807 2808 // This is only legal with VOP3P instructions as an aid to op_sel matching. 
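// Only <2 x s16> shuffles whose mask is a legal VOP3P op_sel pattern are
// handled here; they are rewritten into shifts, packs, alignbit, or SDWA moves
// on a single 32-bit source register, and anything else is rejected.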
2809 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( 2810 MachineInstr &MI) const { 2811 Register DstReg = MI.getOperand(0).getReg(); 2812 Register Src0Reg = MI.getOperand(1).getReg(); 2813 Register Src1Reg = MI.getOperand(2).getReg(); 2814 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); 2815 2816 const LLT V2S16 = LLT::vector(2, 16); 2817 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) 2818 return false; 2819 2820 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) 2821 return false; 2822 2823 assert(ShufMask.size() == 2); 2824 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); 2825 2826 MachineBasicBlock *MBB = MI.getParent(); 2827 const DebugLoc &DL = MI.getDebugLoc(); 2828 2829 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2830 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2831 const TargetRegisterClass &RC = IsVALU ? 2832 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2833 2834 // Handle the degenerate case which should have folded out. 2835 if (ShufMask[0] == -1 && ShufMask[1] == -1) { 2836 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); 2837 2838 MI.eraseFromParent(); 2839 return RBI.constrainGenericRegister(DstReg, RC, *MRI); 2840 } 2841 2842 // A legal VOP3P mask only reads one of the sources. 2843 int Mask[2]; 2844 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); 2845 2846 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || 2847 !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) 2848 return false; 2849 2850 // TODO: This also should have been folded out 2851 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { 2852 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) 2853 .addReg(SrcVec); 2854 2855 MI.eraseFromParent(); 2856 return true; 2857 } 2858 2859 if (Mask[0] == 1 && Mask[1] == -1) { 2860 if (IsVALU) { 2861 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 2862 .addImm(16) 2863 .addReg(SrcVec); 2864 } else { 2865 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 2866 .addReg(SrcVec) 2867 .addImm(16); 2868 } 2869 } else if (Mask[0] == -1 && Mask[1] == 0) { 2870 if (IsVALU) { 2871 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) 2872 .addImm(16) 2873 .addReg(SrcVec); 2874 } else { 2875 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) 2876 .addReg(SrcVec) 2877 .addImm(16); 2878 } 2879 } else if (Mask[0] == 0 && Mask[1] == 0) { 2880 if (IsVALU) { 2881 // Write low half of the register into the high half. 2882 MachineInstr *MovSDWA = 2883 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2884 .addImm(0) // $src0_modifiers 2885 .addReg(SrcVec) // $src0 2886 .addImm(0) // $clamp 2887 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2888 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2889 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2890 .addReg(SrcVec, RegState::Implicit); 2891 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2892 } else { 2893 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2894 .addReg(SrcVec) 2895 .addReg(SrcVec); 2896 } 2897 } else if (Mask[0] == 1 && Mask[1] == 1) { 2898 if (IsVALU) { 2899 // Write high half of the register into the low half. 
2900 MachineInstr *MovSDWA = 2901 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2902 .addImm(0) // $src0_modifiers 2903 .addReg(SrcVec) // $src0 2904 .addImm(0) // $clamp 2905 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel 2906 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2907 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel 2908 .addReg(SrcVec, RegState::Implicit); 2909 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2910 } else { 2911 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) 2912 .addReg(SrcVec) 2913 .addReg(SrcVec); 2914 } 2915 } else if (Mask[0] == 1 && Mask[1] == 0) { 2916 if (IsVALU) { 2917 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg) 2918 .addReg(SrcVec) 2919 .addReg(SrcVec) 2920 .addImm(16); 2921 } else { 2922 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2923 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) 2924 .addReg(SrcVec) 2925 .addImm(16); 2926 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2927 .addReg(TmpReg) 2928 .addReg(SrcVec); 2929 } 2930 } else 2931 llvm_unreachable("all shuffle masks should be handled"); 2932 2933 MI.eraseFromParent(); 2934 return true; 2935 } 2936 2937 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( 2938 MachineInstr &MI) const { 2939 if (STI.hasGFX90AInsts()) 2940 return selectImpl(MI, *CoverageInfo); 2941 2942 MachineBasicBlock *MBB = MI.getParent(); 2943 const DebugLoc &DL = MI.getDebugLoc(); 2944 2945 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { 2946 Function &F = MBB->getParent()->getFunction(); 2947 DiagnosticInfoUnsupported 2948 NoFpRet(F, "return versions of fp atomics not supported", 2949 MI.getDebugLoc(), DS_Error); 2950 F.getContext().diagnose(NoFpRet); 2951 return false; 2952 } 2953 2954 // FIXME: This is only needed because tablegen requires number of dst operands 2955 // in match and replace pattern to be the same. Otherwise patterns can be 2956 // exported from SDag path. 2957 MachineOperand &VDataIn = MI.getOperand(1); 2958 MachineOperand &VIndex = MI.getOperand(3); 2959 MachineOperand &VOffset = MI.getOperand(4); 2960 MachineOperand &SOffset = MI.getOperand(5); 2961 int16_t Offset = MI.getOperand(6).getImm(); 2962 2963 bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI); 2964 bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI); 2965 2966 unsigned Opcode; 2967 if (HasVOffset) { 2968 Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN 2969 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN; 2970 } else { 2971 Opcode = HasVIndex ? 
AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN 2972 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET; 2973 } 2974 2975 if (MRI->getType(VDataIn.getReg()).isVector()) { 2976 switch (Opcode) { 2977 case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN: 2978 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN; 2979 break; 2980 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN: 2981 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN; 2982 break; 2983 case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN: 2984 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN; 2985 break; 2986 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET: 2987 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET; 2988 break; 2989 } 2990 } 2991 2992 auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode)); 2993 I.add(VDataIn); 2994 2995 if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN || 2996 Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) { 2997 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); 2998 BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) 2999 .addReg(VIndex.getReg()) 3000 .addImm(AMDGPU::sub0) 3001 .addReg(VOffset.getReg()) 3002 .addImm(AMDGPU::sub1); 3003 3004 I.addReg(IdxReg); 3005 } else if (HasVIndex) { 3006 I.add(VIndex); 3007 } else if (HasVOffset) { 3008 I.add(VOffset); 3009 } 3010 3011 I.add(MI.getOperand(2)); // rsrc 3012 I.add(SOffset); 3013 I.addImm(Offset); 3014 I.addImm(MI.getOperand(7).getImm()); // cpol 3015 I.cloneMemRefs(MI); 3016 3017 MI.eraseFromParent(); 3018 3019 return true; 3020 } 3021 3022 bool AMDGPUInstructionSelector::selectGlobalAtomicFadd( 3023 MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const { 3024 3025 if (STI.hasGFX90AInsts()) { 3026 // gfx90a adds return versions of the global atomic fadd instructions so no 3027 // special handling is required. 3028 return selectImpl(MI, *CoverageInfo); 3029 } 3030 3031 MachineBasicBlock *MBB = MI.getParent(); 3032 const DebugLoc &DL = MI.getDebugLoc(); 3033 3034 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { 3035 Function &F = MBB->getParent()->getFunction(); 3036 DiagnosticInfoUnsupported 3037 NoFpRet(F, "return versions of fp atomics not supported", 3038 MI.getDebugLoc(), DS_Error); 3039 F.getContext().diagnose(NoFpRet); 3040 return false; 3041 } 3042 3043 // FIXME: This is only needed because tablegen requires number of dst operands 3044 // in match and replace pattern to be the same. Otherwise patterns can be 3045 // exported from SDag path. 3046 auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal); 3047 3048 Register Data = DataOp.getReg(); 3049 const unsigned Opc = MRI->getType(Data).isVector() ? 
3050 AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32; 3051 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) 3052 .addReg(Addr.first) 3053 .addReg(Data) 3054 .addImm(Addr.second) 3055 .addImm(0) // cpol 3056 .cloneMemRefs(MI); 3057 3058 MI.eraseFromParent(); 3059 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3060 } 3061 3062 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ 3063 MI.setDesc(TII.get(MI.getOperand(1).getImm())); 3064 MI.RemoveOperand(1); 3065 MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); 3066 return true; 3067 } 3068 3069 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 3070 if (I.isPHI()) 3071 return selectPHI(I); 3072 3073 if (!I.isPreISelOpcode()) { 3074 if (I.isCopy()) 3075 return selectCOPY(I); 3076 return true; 3077 } 3078 3079 switch (I.getOpcode()) { 3080 case TargetOpcode::G_AND: 3081 case TargetOpcode::G_OR: 3082 case TargetOpcode::G_XOR: 3083 if (selectImpl(I, *CoverageInfo)) 3084 return true; 3085 return selectG_AND_OR_XOR(I); 3086 case TargetOpcode::G_ADD: 3087 case TargetOpcode::G_SUB: 3088 if (selectImpl(I, *CoverageInfo)) 3089 return true; 3090 return selectG_ADD_SUB(I); 3091 case TargetOpcode::G_UADDO: 3092 case TargetOpcode::G_USUBO: 3093 case TargetOpcode::G_UADDE: 3094 case TargetOpcode::G_USUBE: 3095 return selectG_UADDO_USUBO_UADDE_USUBE(I); 3096 case TargetOpcode::G_INTTOPTR: 3097 case TargetOpcode::G_BITCAST: 3098 case TargetOpcode::G_PTRTOINT: 3099 return selectCOPY(I); 3100 case TargetOpcode::G_CONSTANT: 3101 case TargetOpcode::G_FCONSTANT: 3102 return selectG_CONSTANT(I); 3103 case TargetOpcode::G_FNEG: 3104 if (selectImpl(I, *CoverageInfo)) 3105 return true; 3106 return selectG_FNEG(I); 3107 case TargetOpcode::G_FABS: 3108 if (selectImpl(I, *CoverageInfo)) 3109 return true; 3110 return selectG_FABS(I); 3111 case TargetOpcode::G_EXTRACT: 3112 return selectG_EXTRACT(I); 3113 case TargetOpcode::G_MERGE_VALUES: 3114 case TargetOpcode::G_BUILD_VECTOR: 3115 case TargetOpcode::G_CONCAT_VECTORS: 3116 return selectG_MERGE_VALUES(I); 3117 case TargetOpcode::G_UNMERGE_VALUES: 3118 return selectG_UNMERGE_VALUES(I); 3119 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 3120 return selectG_BUILD_VECTOR_TRUNC(I); 3121 case TargetOpcode::G_PTR_ADD: 3122 return selectG_PTR_ADD(I); 3123 case TargetOpcode::G_IMPLICIT_DEF: 3124 return selectG_IMPLICIT_DEF(I); 3125 case TargetOpcode::G_FREEZE: 3126 return selectCOPY(I); 3127 case TargetOpcode::G_INSERT: 3128 return selectG_INSERT(I); 3129 case TargetOpcode::G_INTRINSIC: 3130 return selectG_INTRINSIC(I); 3131 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 3132 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 3133 case TargetOpcode::G_ICMP: 3134 if (selectG_ICMP(I)) 3135 return true; 3136 return selectImpl(I, *CoverageInfo); 3137 case TargetOpcode::G_LOAD: 3138 case TargetOpcode::G_STORE: 3139 case TargetOpcode::G_ATOMIC_CMPXCHG: 3140 case TargetOpcode::G_ATOMICRMW_XCHG: 3141 case TargetOpcode::G_ATOMICRMW_ADD: 3142 case TargetOpcode::G_ATOMICRMW_SUB: 3143 case TargetOpcode::G_ATOMICRMW_AND: 3144 case TargetOpcode::G_ATOMICRMW_OR: 3145 case TargetOpcode::G_ATOMICRMW_XOR: 3146 case TargetOpcode::G_ATOMICRMW_MIN: 3147 case TargetOpcode::G_ATOMICRMW_MAX: 3148 case TargetOpcode::G_ATOMICRMW_UMIN: 3149 case TargetOpcode::G_ATOMICRMW_UMAX: 3150 case TargetOpcode::G_ATOMICRMW_FADD: 3151 case AMDGPU::G_AMDGPU_ATOMIC_INC: 3152 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 3153 case AMDGPU::G_AMDGPU_ATOMIC_FMIN: 3154 case AMDGPU::G_AMDGPU_ATOMIC_FMAX: 3155 return 
selectG_LOAD_STORE_ATOMICRMW(I); 3156 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 3157 return selectG_AMDGPU_ATOMIC_CMPXCHG(I); 3158 case TargetOpcode::G_SELECT: 3159 return selectG_SELECT(I); 3160 case TargetOpcode::G_TRUNC: 3161 return selectG_TRUNC(I); 3162 case TargetOpcode::G_SEXT: 3163 case TargetOpcode::G_ZEXT: 3164 case TargetOpcode::G_ANYEXT: 3165 case TargetOpcode::G_SEXT_INREG: 3166 if (selectImpl(I, *CoverageInfo)) 3167 return true; 3168 return selectG_SZA_EXT(I); 3169 case TargetOpcode::G_BRCOND: 3170 return selectG_BRCOND(I); 3171 case TargetOpcode::G_GLOBAL_VALUE: 3172 return selectG_GLOBAL_VALUE(I); 3173 case TargetOpcode::G_PTRMASK: 3174 return selectG_PTRMASK(I); 3175 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3176 return selectG_EXTRACT_VECTOR_ELT(I); 3177 case TargetOpcode::G_INSERT_VECTOR_ELT: 3178 return selectG_INSERT_VECTOR_ELT(I); 3179 case TargetOpcode::G_SHUFFLE_VECTOR: 3180 return selectG_SHUFFLE_VECTOR(I); 3181 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 3182 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 3183 const AMDGPU::ImageDimIntrinsicInfo *Intr 3184 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); 3185 assert(Intr && "not an image intrinsic with image pseudo"); 3186 return selectImageIntrinsic(I, Intr); 3187 } 3188 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: 3189 return selectBVHIntrinsic(I); 3190 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 3191 return selectAMDGPU_BUFFER_ATOMIC_FADD(I); 3192 default: 3193 return selectImpl(I, *CoverageInfo); 3194 } 3195 return false; 3196 } 3197 3198 InstructionSelector::ComplexRendererFns 3199 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 3200 return {{ 3201 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 3202 }}; 3203 3204 } 3205 3206 std::pair<Register, unsigned> 3207 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, 3208 bool AllowAbs) const { 3209 Register Src = Root.getReg(); 3210 Register OrigSrc = Src; 3211 unsigned Mods = 0; 3212 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 3213 3214 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 3215 Src = MI->getOperand(1).getReg(); 3216 Mods |= SISrcMods::NEG; 3217 MI = getDefIgnoringCopies(Src, *MRI); 3218 } 3219 3220 if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) { 3221 Src = MI->getOperand(1).getReg(); 3222 Mods |= SISrcMods::ABS; 3223 } 3224 3225 if (Mods != 0 && 3226 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 3227 MachineInstr *UseMI = Root.getParent(); 3228 3229 // If we looked through copies to find source modifiers on an SGPR operand, 3230 // we now have an SGPR register source. To avoid potentially violating the 3231 // constant bus restriction, we need to insert a copy to a VGPR. 3232 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); 3233 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 3234 TII.get(AMDGPU::COPY), VGPRSrc) 3235 .addReg(Src); 3236 Src = VGPRSrc; 3237 } 3238 3239 return std::make_pair(Src, Mods); 3240 } 3241 3242 /// 3243 /// This will select either an SGPR or VGPR operand and will save us from 3244 /// having to write an extra tablegen pattern. 
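/// The renderer simply forwards the root operand unchanged; the register is
/// constrained to a concrete class later, when the selected instruction's
/// operands are constrained.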
3245 InstructionSelector::ComplexRendererFns 3246 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 3247 return {{ 3248 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 3249 }}; 3250 } 3251 3252 InstructionSelector::ComplexRendererFns 3253 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 3254 Register Src; 3255 unsigned Mods; 3256 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3257 3258 return {{ 3259 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3260 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 3261 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3262 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3263 }}; 3264 } 3265 3266 InstructionSelector::ComplexRendererFns 3267 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { 3268 Register Src; 3269 unsigned Mods; 3270 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); 3271 3272 return {{ 3273 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3274 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 3275 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3276 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3277 }}; 3278 } 3279 3280 InstructionSelector::ComplexRendererFns 3281 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 3282 return {{ 3283 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 3284 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3285 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3286 }}; 3287 } 3288 3289 InstructionSelector::ComplexRendererFns 3290 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 3291 Register Src; 3292 unsigned Mods; 3293 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3294 3295 return {{ 3296 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3297 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3298 }}; 3299 } 3300 3301 InstructionSelector::ComplexRendererFns 3302 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { 3303 Register Src; 3304 unsigned Mods; 3305 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); 3306 3307 return {{ 3308 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3309 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3310 }}; 3311 } 3312 3313 InstructionSelector::ComplexRendererFns 3314 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 3315 Register Reg = Root.getReg(); 3316 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 3317 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 3318 Def->getOpcode() == AMDGPU::G_FABS)) 3319 return {}; 3320 return {{ 3321 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3322 }}; 3323 } 3324 3325 std::pair<Register, unsigned> 3326 AMDGPUInstructionSelector::selectVOP3PModsImpl( 3327 Register Src, const MachineRegisterInfo &MRI) const { 3328 unsigned Mods = 0; 3329 MachineInstr *MI = MRI.getVRegDef(Src); 3330 3331 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 3332 // It's possible to see an f32 fneg here, but unlikely. 3333 // TODO: Treat f32 fneg as only high bit. 3334 MRI.getType(Src) == LLT::vector(2, 16)) { 3335 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 3336 Src = MI->getOperand(1).getReg(); 3337 MI = MRI.getVRegDef(Src); 3338 } 3339 3340 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 3341 3342 // Packed instructions do not have abs modifiers. 
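  // Setting OP_SEL_1 (op_sel_hi) keeps the usual VOP3P default: the high half
  // of each 32-bit source feeds the high half of the result.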
3343 Mods |= SISrcMods::OP_SEL_1; 3344 3345 return std::make_pair(Src, Mods); 3346 } 3347 3348 InstructionSelector::ComplexRendererFns 3349 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 3350 MachineRegisterInfo &MRI 3351 = Root.getParent()->getParent()->getParent()->getRegInfo(); 3352 3353 Register Src; 3354 unsigned Mods; 3355 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 3356 3357 return {{ 3358 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3359 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3360 }}; 3361 } 3362 3363 InstructionSelector::ComplexRendererFns 3364 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 3365 Register Src; 3366 unsigned Mods; 3367 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3368 if (!isKnownNeverNaN(Src, *MRI)) 3369 return None; 3370 3371 return {{ 3372 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3373 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3374 }}; 3375 } 3376 3377 InstructionSelector::ComplexRendererFns 3378 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 3379 // FIXME: Handle op_sel 3380 return {{ 3381 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 3382 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 3383 }}; 3384 } 3385 3386 InstructionSelector::ComplexRendererFns 3387 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 3388 SmallVector<GEPInfo, 4> AddrInfo; 3389 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3390 3391 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3392 return None; 3393 3394 const GEPInfo &GEPInfo = AddrInfo[0]; 3395 Optional<int64_t> EncodedImm = 3396 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false); 3397 if (!EncodedImm) 3398 return None; 3399 3400 unsigned PtrReg = GEPInfo.SgprParts[0]; 3401 return {{ 3402 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3403 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3404 }}; 3405 } 3406 3407 InstructionSelector::ComplexRendererFns 3408 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 3409 SmallVector<GEPInfo, 4> AddrInfo; 3410 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 3411 3412 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3413 return None; 3414 3415 const GEPInfo &GEPInfo = AddrInfo[0]; 3416 Register PtrReg = GEPInfo.SgprParts[0]; 3417 Optional<int64_t> EncodedImm = 3418 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 3419 if (!EncodedImm) 3420 return None; 3421 3422 return {{ 3423 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 3424 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 3425 }}; 3426 } 3427 3428 InstructionSelector::ComplexRendererFns 3429 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 3430 MachineInstr *MI = Root.getParent(); 3431 MachineBasicBlock *MBB = MI->getParent(); 3432 3433 SmallVector<GEPInfo, 4> AddrInfo; 3434 getAddrModeInfo(*MI, *MRI, AddrInfo); 3435 3436 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, 3437 // then we can select all ptr + 32-bit offsets not just immediate offsets. 3438 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 3439 return None; 3440 3441 const GEPInfo &GEPInfo = AddrInfo[0]; 3442 // SGPR offset is unsigned. 3443 if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) 3444 return None; 3445 3446 // If we make it this far we have a load with an 32-bit immediate offset. 
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM patterns are considered before the _SGPR patterns.
  Register PtrReg = GEPInfo.SgprParts[0];
  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(GEPInfo.Imm);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}

std::pair<Register, int>
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
                                                uint64_t FlatVariant) const {
  MachineInstr *MI = Root.getParent();

  auto Default = std::make_pair(Root.getReg(), 0);

  if (!STI.hasFlatInstOffsets())
    return Default;

  Register PtrBase;
  int64_t ConstOffset;
  std::tie(PtrBase, ConstOffset) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
  if (ConstOffset == 0)
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
    return Default;

  return std::make_pair(PtrBase, ConstOffset);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}

/// Match a zero extend from a 32-bit value to 64 bits.
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
  Register ZExtSrc;
  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
    return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();

  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
    return Register();

  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
    return Def->getOperand(1).getReg();
  }

  return Register();
}

// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

  if (ConstOffset != 0) {
    if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
                              SIInstrFlags::FlatGlobal)) {
      Addr = PtrBase;
      ImmOffset = ConstOffset;
    } else {
      auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
      if (!PtrBaseDef)
        return None;

      if (isSGPR(PtrBaseDef->Reg)) {
        if (ConstOffset > 0) {
          // Offset is too large.
          //
          // saddr + large_offset -> saddr +
          //                         (voffset = large_offset & ~MaxOffset) +
          //                         (large_offset & MaxOffset);
          int64_t SplitImmOffset, RemainderOffset;
          std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
              ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);

          if (isUInt<32>(RemainderOffset)) {
            MachineInstr *MI = Root.getParent();
            MachineBasicBlock *MBB = MI->getParent();
            Register HighBits =
                MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

            BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
                    HighBits)
                .addImm(RemainderOffset);

            return {{
                [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
                [=](MachineInstrBuilder &MIB) {
                  MIB.addReg(HighBits);
                }, // voffset
                [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
            }};
          }
        }

        // We are adding a 64-bit SGPR and a constant. If the constant bus
        // limit is 1, we would need 1 or 2 extra moves for each half of the
        // constant, so it is better to do a scalar add and then issue a
        // single VALU instruction to materialize zero. Otherwise it takes
        // fewer instructions to perform VALU adds with immediates or inline
        // literals.
        unsigned NumLiterals =
            !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
            !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
        if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
          return None;
      }
    }
  }

  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  if (!AddrDef)
    return None;

  // Match the variable offset.
  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
    // Look through the SGPR->VGPR copy.
    Register SAddr =
        getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);

    if (SAddr && isSGPR(SAddr)) {
      Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();

      // It's possible voffset is an SGPR here, but the copy to VGPR will be
      // inserted later.
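      // The overall shape being matched is roughly (illustrative MIR, banks
      // and copies elided):
      //   %voff:_(s32) = ...
      //   %zext:_(s64) = G_ZEXT %voff        ; or G_MERGE_VALUES %voff, 0
      //   %addr:_(p1)  = G_PTR_ADD %saddr, %zext
      // which maps onto the saddr + voffset + imm offset form of global
      // instructions.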
3614 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { 3615 return {{[=](MachineInstrBuilder &MIB) { // saddr 3616 MIB.addReg(SAddr); 3617 }, 3618 [=](MachineInstrBuilder &MIB) { // voffset 3619 MIB.addReg(VOffset); 3620 }, 3621 [=](MachineInstrBuilder &MIB) { // offset 3622 MIB.addImm(ImmOffset); 3623 }}}; 3624 } 3625 } 3626 } 3627 3628 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and 3629 // drop this. 3630 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF || 3631 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg)) 3632 return None; 3633 3634 // It's cheaper to materialize a single 32-bit zero for vaddr than the two 3635 // moves required to copy a 64-bit SGPR to VGPR. 3636 MachineInstr *MI = Root.getParent(); 3637 MachineBasicBlock *MBB = MI->getParent(); 3638 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3639 3640 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset) 3641 .addImm(0); 3642 3643 return {{ 3644 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr 3645 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset 3646 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 3647 }}; 3648 } 3649 3650 InstructionSelector::ComplexRendererFns 3651 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { 3652 Register Addr = Root.getReg(); 3653 Register PtrBase; 3654 int64_t ConstOffset; 3655 int64_t ImmOffset = 0; 3656 3657 // Match the immediate offset first, which canonically is moved as low as 3658 // possible. 3659 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 3660 3661 if (ConstOffset != 0 && 3662 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, 3663 SIInstrFlags::FlatScratch)) { 3664 Addr = PtrBase; 3665 ImmOffset = ConstOffset; 3666 } 3667 3668 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 3669 if (!AddrDef) 3670 return None; 3671 3672 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { 3673 int FI = AddrDef->MI->getOperand(1).getIndex(); 3674 return {{ 3675 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr 3676 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 3677 }}; 3678 } 3679 3680 Register SAddr = AddrDef->Reg; 3681 3682 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 3683 Register LHS = AddrDef->MI->getOperand(1).getReg(); 3684 Register RHS = AddrDef->MI->getOperand(2).getReg(); 3685 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); 3686 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI); 3687 3688 if (LHSDef && RHSDef && 3689 LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX && 3690 isSGPR(RHSDef->Reg)) { 3691 int FI = LHSDef->MI->getOperand(1).getIndex(); 3692 MachineInstr &I = *Root.getParent(); 3693 MachineBasicBlock *BB = I.getParent(); 3694 const DebugLoc &DL = I.getDebugLoc(); 3695 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3696 3697 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr) 3698 .addFrameIndex(FI) 3699 .addReg(RHSDef->Reg); 3700 } 3701 } 3702 3703 if (!isSGPR(SAddr)) 3704 return None; 3705 3706 return {{ 3707 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr 3708 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 3709 }}; 3710 } 3711 3712 InstructionSelector::ComplexRendererFns 3713 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 3714 MachineInstr *MI = Root.getParent(); 3715 MachineBasicBlock 
*MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
      Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
      .addImm(Offset & ~4095);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               // Use constant zero for soffset and rely on eliminateFrameIndex
               // to choose the appropriate frame register if need be.
               MIB.addImm(0);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0 || Offset == -1);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    Register PtrBase;
    int64_t ConstOffset;
    std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
    if (ConstOffset != 0) {
      if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
          (!STI.privateMemoryResourceIsRangeChecked() ||
           KnownBits->signBitIsZero(PtrBase))) {
        const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
        if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
          FI = PtrBaseDef->getOperand(1).getIndex();
        else
          VAddr = PtrBase;
        Offset = ConstOffset;
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             // Use constant zero for soffset and rely on eliminateFrameIndex
             // to choose the appropriate frame register if need be.
             MIB.addImm(0);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}

bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
                                                int64_t Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return KnownBits->signBitIsZero(Base);
}

bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
                                                 int64_t Offset1,
                                                 unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
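  // Requiring the sign bit of the base to be known zero is a conservative way
  // to guarantee a non-negative base address on those targets.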
3817 return KnownBits->signBitIsZero(Base); 3818 } 3819 3820 InstructionSelector::ComplexRendererFns 3821 AMDGPUInstructionSelector::selectMUBUFScratchOffset( 3822 MachineOperand &Root) const { 3823 MachineInstr *MI = Root.getParent(); 3824 MachineBasicBlock *MBB = MI->getParent(); 3825 3826 int64_t Offset = 0; 3827 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || 3828 !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) 3829 return {}; 3830 3831 const MachineFunction *MF = MBB->getParent(); 3832 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 3833 3834 return {{ 3835 [=](MachineInstrBuilder &MIB) { // rsrc 3836 MIB.addReg(Info->getScratchRSrcReg()); 3837 }, 3838 [=](MachineInstrBuilder &MIB) { // soffset 3839 MIB.addImm(0); 3840 }, 3841 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 3842 }}; 3843 } 3844 3845 std::pair<Register, unsigned> 3846 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 3847 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 3848 if (!RootDef) 3849 return std::make_pair(Root.getReg(), 0); 3850 3851 int64_t ConstAddr = 0; 3852 3853 Register PtrBase; 3854 int64_t Offset; 3855 std::tie(PtrBase, Offset) = 3856 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 3857 3858 if (Offset) { 3859 if (isDSOffsetLegal(PtrBase, Offset)) { 3860 // (add n0, c0) 3861 return std::make_pair(PtrBase, Offset); 3862 } 3863 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 3864 // TODO 3865 3866 3867 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 3868 // TODO 3869 3870 } 3871 3872 return std::make_pair(Root.getReg(), 0); 3873 } 3874 3875 InstructionSelector::ComplexRendererFns 3876 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { 3877 Register Reg; 3878 unsigned Offset; 3879 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); 3880 return {{ 3881 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3882 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } 3883 }}; 3884 } 3885 3886 InstructionSelector::ComplexRendererFns 3887 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { 3888 return selectDSReadWrite2(Root, 4); 3889 } 3890 3891 InstructionSelector::ComplexRendererFns 3892 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const { 3893 return selectDSReadWrite2(Root, 8); 3894 } 3895 3896 InstructionSelector::ComplexRendererFns 3897 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root, 3898 unsigned Size) const { 3899 Register Reg; 3900 unsigned Offset; 3901 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size); 3902 return {{ 3903 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3904 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, 3905 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); } 3906 }}; 3907 } 3908 3909 std::pair<Register, unsigned> 3910 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, 3911 unsigned Size) const { 3912 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 3913 if (!RootDef) 3914 return std::make_pair(Root.getReg(), 0); 3915 3916 int64_t ConstAddr = 0; 3917 3918 Register PtrBase; 3919 int64_t Offset; 3920 std::tie(PtrBase, Offset) = 3921 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 3922 3923 if (Offset) { 3924 int64_t OffsetValue0 = Offset; 3925 int64_t OffsetValue1 = Offset + Size; 3926 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) { 3927 // (add n0, c0) 3928 return std::make_pair(PtrBase, 
OffsetValue0 / Size); 3929 } 3930 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 3931 // TODO 3932 3933 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 3934 // TODO 3935 3936 } 3937 3938 return std::make_pair(Root.getReg(), 0); 3939 } 3940 3941 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return 3942 /// the base value with the constant offset. There may be intervening copies 3943 /// between \p Root and the identified constant. Returns \p Root, 0 if this does 3944 /// not match the pattern. 3945 std::pair<Register, int64_t> 3946 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( 3947 Register Root, const MachineRegisterInfo &MRI) const { 3948 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI); 3949 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) 3950 return {Root, 0}; 3951 3952 MachineOperand &RHS = RootI->getOperand(2); 3953 Optional<ValueAndVReg> MaybeOffset 3954 = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true); 3955 if (!MaybeOffset) 3956 return {Root, 0}; 3957 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()}; 3958 } 3959 3960 static void addZeroImm(MachineInstrBuilder &MIB) { 3961 MIB.addImm(0); 3962 } 3963 3964 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p 3965 /// BasePtr is not valid, a null base pointer will be used. 3966 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, 3967 uint32_t FormatLo, uint32_t FormatHi, 3968 Register BasePtr) { 3969 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 3970 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 3971 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 3972 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 3973 3974 B.buildInstr(AMDGPU::S_MOV_B32) 3975 .addDef(RSrc2) 3976 .addImm(FormatLo); 3977 B.buildInstr(AMDGPU::S_MOV_B32) 3978 .addDef(RSrc3) 3979 .addImm(FormatHi); 3980 3981 // Build the half of the subregister with the constants before building the 3982 // full 128-bit register. If we are building multiple resource descriptors, 3983 // this will allow CSEing of the 2-component register. 3984 B.buildInstr(AMDGPU::REG_SEQUENCE) 3985 .addDef(RSrcHi) 3986 .addReg(RSrc2) 3987 .addImm(AMDGPU::sub0) 3988 .addReg(RSrc3) 3989 .addImm(AMDGPU::sub1); 3990 3991 Register RSrcLo = BasePtr; 3992 if (!BasePtr) { 3993 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 3994 B.buildInstr(AMDGPU::S_MOV_B64) 3995 .addDef(RSrcLo) 3996 .addImm(0); 3997 } 3998 3999 B.buildInstr(AMDGPU::REG_SEQUENCE) 4000 .addDef(RSrc) 4001 .addReg(RSrcLo) 4002 .addImm(AMDGPU::sub0_sub1) 4003 .addReg(RSrcHi) 4004 .addImm(AMDGPU::sub2_sub3); 4005 4006 return RSrc; 4007 } 4008 4009 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 4010 const SIInstrInfo &TII, Register BasePtr) { 4011 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 4012 4013 // FIXME: Why are half the "default" bits ignored based on the addressing 4014 // mode? 4015 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr); 4016 } 4017 4018 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 4019 const SIInstrInfo &TII, Register BasePtr) { 4020 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 4021 4022 // FIXME: Why are half the "default" bits ignored based on the addressing 4023 // mode? 
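  // Note: the -1 passed as FormatLo below ends up in dword 2 of the
  // descriptor (num_records) via buildRSRC, i.e. the buffer gets an
  // effectively unbounded extent for this offset-only addressing mode.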
4024 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr); 4025 } 4026 4027 AMDGPUInstructionSelector::MUBUFAddressData 4028 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { 4029 MUBUFAddressData Data; 4030 Data.N0 = Src; 4031 4032 Register PtrBase; 4033 int64_t Offset; 4034 4035 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); 4036 if (isUInt<32>(Offset)) { 4037 Data.N0 = PtrBase; 4038 Data.Offset = Offset; 4039 } 4040 4041 if (MachineInstr *InputAdd 4042 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) { 4043 Data.N2 = InputAdd->getOperand(1).getReg(); 4044 Data.N3 = InputAdd->getOperand(2).getReg(); 4045 4046 // FIXME: Need to fix extra SGPR->VGPRcopies inserted 4047 // FIXME: Don't know this was defined by operand 0 4048 // 4049 // TODO: Remove this when we have copy folding optimizations after 4050 // RegBankSelect. 4051 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg(); 4052 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg(); 4053 } 4054 4055 return Data; 4056 } 4057 4058 /// Return if the addr64 mubuf mode should be used for the given address. 4059 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const { 4060 // (ptr_add N2, N3) -> addr64, or 4061 // (ptr_add (ptr_add N2, N3), C1) -> addr64 4062 if (Addr.N2) 4063 return true; 4064 4065 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI); 4066 return N0Bank->getID() == AMDGPU::VGPRRegBankID; 4067 } 4068 4069 /// Split an immediate offset \p ImmOffset depending on whether it fits in the 4070 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable 4071 /// component. 4072 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset( 4073 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const { 4074 if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset)) 4075 return; 4076 4077 // Illegal offset, store it in soffset. 4078 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 4079 B.buildInstr(AMDGPU::S_MOV_B32) 4080 .addDef(SOffset) 4081 .addImm(ImmOffset); 4082 ImmOffset = 0; 4083 } 4084 4085 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl( 4086 MachineOperand &Root, Register &VAddr, Register &RSrcReg, 4087 Register &SOffset, int64_t &Offset) const { 4088 // FIXME: Predicates should stop this from reaching here. 4089 // addr64 bit was removed for volcanic islands. 4090 if (!STI.hasAddr64() || STI.useFlatForGlobal()) 4091 return false; 4092 4093 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 4094 if (!shouldUseAddr64(AddrData)) 4095 return false; 4096 4097 Register N0 = AddrData.N0; 4098 Register N2 = AddrData.N2; 4099 Register N3 = AddrData.N3; 4100 Offset = AddrData.Offset; 4101 4102 // Base pointer for the SRD. 4103 Register SRDPtr; 4104 4105 if (N2) { 4106 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 4107 assert(N3); 4108 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 4109 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the 4110 // addr64, and construct the default resource from a 0 address. 4111 VAddr = N0; 4112 } else { 4113 SRDPtr = N3; 4114 VAddr = N2; 4115 } 4116 } else { 4117 // N2 is not divergent. 
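      // Use it as the SRD base pointer; N3 then provides the addr64 vaddr.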
4118 SRDPtr = N2; 4119 VAddr = N3; 4120 } 4121 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 4122 // Use the default null pointer in the resource 4123 VAddr = N0; 4124 } else { 4125 // N0 -> offset, or 4126 // (N0 + C1) -> offset 4127 SRDPtr = N0; 4128 } 4129 4130 MachineIRBuilder B(*Root.getParent()); 4131 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr); 4132 splitIllegalMUBUFOffset(B, SOffset, Offset); 4133 return true; 4134 } 4135 4136 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( 4137 MachineOperand &Root, Register &RSrcReg, Register &SOffset, 4138 int64_t &Offset) const { 4139 4140 // FIXME: Pattern should not reach here. 4141 if (STI.useFlatForGlobal()) 4142 return false; 4143 4144 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 4145 if (shouldUseAddr64(AddrData)) 4146 return false; 4147 4148 // N0 -> offset, or 4149 // (N0 + C1) -> offset 4150 Register SRDPtr = AddrData.N0; 4151 Offset = AddrData.Offset; 4152 4153 // TODO: Look through extensions for 32-bit soffset. 4154 MachineIRBuilder B(*Root.getParent()); 4155 4156 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); 4157 splitIllegalMUBUFOffset(B, SOffset, Offset); 4158 return true; 4159 } 4160 4161 InstructionSelector::ComplexRendererFns 4162 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { 4163 Register VAddr; 4164 Register RSrcReg; 4165 Register SOffset; 4166 int64_t Offset = 0; 4167 4168 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 4169 return {}; 4170 4171 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 4172 // pattern. 4173 return {{ 4174 [=](MachineInstrBuilder &MIB) { // rsrc 4175 MIB.addReg(RSrcReg); 4176 }, 4177 [=](MachineInstrBuilder &MIB) { // vaddr 4178 MIB.addReg(VAddr); 4179 }, 4180 [=](MachineInstrBuilder &MIB) { // soffset 4181 if (SOffset) 4182 MIB.addReg(SOffset); 4183 else 4184 MIB.addImm(0); 4185 }, 4186 [=](MachineInstrBuilder &MIB) { // offset 4187 MIB.addImm(Offset); 4188 }, 4189 addZeroImm, // cpol 4190 addZeroImm, // tfe 4191 addZeroImm // swz 4192 }}; 4193 } 4194 4195 InstructionSelector::ComplexRendererFns 4196 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { 4197 Register RSrcReg; 4198 Register SOffset; 4199 int64_t Offset = 0; 4200 4201 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 4202 return {}; 4203 4204 return {{ 4205 [=](MachineInstrBuilder &MIB) { // rsrc 4206 MIB.addReg(RSrcReg); 4207 }, 4208 [=](MachineInstrBuilder &MIB) { // soffset 4209 if (SOffset) 4210 MIB.addReg(SOffset); 4211 else 4212 MIB.addImm(0); 4213 }, 4214 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 4215 addZeroImm, // cpol 4216 addZeroImm, // tfe 4217 addZeroImm, // swz 4218 }}; 4219 } 4220 4221 InstructionSelector::ComplexRendererFns 4222 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const { 4223 Register VAddr; 4224 Register RSrcReg; 4225 Register SOffset; 4226 int64_t Offset = 0; 4227 4228 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 4229 return {}; 4230 4231 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 4232 // pattern. 
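  // Unlike the non-atomic variants above, cpol is hard-wired to GLC below;
  // for buffer atomics GLC requests that the pre-op value be returned.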
4233 return {{ 4234 [=](MachineInstrBuilder &MIB) { // rsrc 4235 MIB.addReg(RSrcReg); 4236 }, 4237 [=](MachineInstrBuilder &MIB) { // vaddr 4238 MIB.addReg(VAddr); 4239 }, 4240 [=](MachineInstrBuilder &MIB) { // soffset 4241 if (SOffset) 4242 MIB.addReg(SOffset); 4243 else 4244 MIB.addImm(0); 4245 }, 4246 [=](MachineInstrBuilder &MIB) { // offset 4247 MIB.addImm(Offset); 4248 }, 4249 [=](MachineInstrBuilder &MIB) { 4250 MIB.addImm(AMDGPU::CPol::GLC); // cpol 4251 } 4252 }}; 4253 } 4254 4255 InstructionSelector::ComplexRendererFns 4256 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const { 4257 Register RSrcReg; 4258 Register SOffset; 4259 int64_t Offset = 0; 4260 4261 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 4262 return {}; 4263 4264 return {{ 4265 [=](MachineInstrBuilder &MIB) { // rsrc 4266 MIB.addReg(RSrcReg); 4267 }, 4268 [=](MachineInstrBuilder &MIB) { // soffset 4269 if (SOffset) 4270 MIB.addReg(SOffset); 4271 else 4272 MIB.addImm(0); 4273 }, 4274 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 4275 [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol 4276 }}; 4277 } 4278 4279 /// Get an immediate that must be 32-bits, and treated as zero extended. 4280 static Optional<uint64_t> getConstantZext32Val(Register Reg, 4281 const MachineRegisterInfo &MRI) { 4282 // getConstantVRegVal sexts any values, so see if that matters. 4283 Optional<int64_t> OffsetVal = getConstantVRegSExtVal(Reg, MRI); 4284 if (!OffsetVal || !isInt<32>(*OffsetVal)) 4285 return None; 4286 return Lo_32(*OffsetVal); 4287 } 4288 4289 InstructionSelector::ComplexRendererFns 4290 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const { 4291 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); 4292 if (!OffsetVal) 4293 return {}; 4294 4295 Optional<int64_t> EncodedImm = 4296 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true); 4297 if (!EncodedImm) 4298 return {}; 4299 4300 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 4301 } 4302 4303 InstructionSelector::ComplexRendererFns 4304 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const { 4305 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); 4306 4307 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); 4308 if (!OffsetVal) 4309 return {}; 4310 4311 Optional<int64_t> EncodedImm 4312 = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal); 4313 if (!EncodedImm) 4314 return {}; 4315 4316 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 4317 } 4318 4319 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, 4320 const MachineInstr &MI, 4321 int OpIdx) const { 4322 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 4323 "Expected G_CONSTANT"); 4324 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue()); 4325 } 4326 4327 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, 4328 const MachineInstr &MI, 4329 int OpIdx) const { 4330 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 4331 "Expected G_CONSTANT"); 4332 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue()); 4333 } 4334 4335 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB, 4336 const MachineInstr &MI, 4337 int OpIdx) const { 4338 assert(OpIdx == -1); 4339 4340 const MachineOperand &Op = MI.getOperand(1); 4341 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) 4342 
MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 4343 else { 4344 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); 4345 MIB.addImm(Op.getCImm()->getSExtValue()); 4346 } 4347 } 4348 4349 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, 4350 const MachineInstr &MI, 4351 int OpIdx) const { 4352 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 4353 "Expected G_CONSTANT"); 4354 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); 4355 } 4356 4357 /// This only really exists to satisfy DAG type checking machinery, so is a 4358 /// no-op here. 4359 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, 4360 const MachineInstr &MI, 4361 int OpIdx) const { 4362 MIB.addImm(MI.getOperand(OpIdx).getImm()); 4363 } 4364 4365 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, 4366 const MachineInstr &MI, 4367 int OpIdx) const { 4368 assert(OpIdx >= 0 && "expected to match an immediate operand"); 4369 MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL); 4370 } 4371 4372 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, 4373 const MachineInstr &MI, 4374 int OpIdx) const { 4375 assert(OpIdx >= 0 && "expected to match an immediate operand"); 4376 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); 4377 } 4378 4379 void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB, 4380 const MachineInstr &MI, 4381 int OpIdx) const { 4382 assert(OpIdx >= 0 && "expected to match an immediate operand"); 4383 MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC); 4384 } 4385 4386 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, 4387 const MachineInstr &MI, 4388 int OpIdx) const { 4389 MIB.addFrameIndex((MI.getOperand(1).getIndex())); 4390 } 4391 4392 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { 4393 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); 4394 } 4395 4396 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const { 4397 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm()); 4398 } 4399 4400 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const { 4401 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm()); 4402 } 4403 4404 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { 4405 return TII.isInlineConstant(Imm); 4406 } 4407