1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUInstructionSelector.h" 15 #include "AMDGPUInstrInfo.h" 16 #include "AMDGPUGlobalISelUtils.h" 17 #include "AMDGPURegisterBankInfo.h" 18 #include "AMDGPURegisterInfo.h" 19 #include "AMDGPUSubtarget.h" 20 #include "AMDGPUTargetMachine.h" 21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 22 #include "SIMachineFunctionInfo.h" 23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 24 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 25 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 28 #include "llvm/CodeGen/GlobalISel/Utils.h" 29 #include "llvm/CodeGen/MachineBasicBlock.h" 30 #include "llvm/CodeGen/MachineFunction.h" 31 #include "llvm/CodeGen/MachineInstr.h" 32 #include "llvm/CodeGen/MachineInstrBuilder.h" 33 #include "llvm/CodeGen/MachineRegisterInfo.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/Support/Debug.h" 36 #include "llvm/Support/raw_ostream.h" 37 38 #define DEBUG_TYPE "amdgpu-isel" 39 40 using namespace llvm; 41 using namespace MIPatternMatch; 42 43 #define GET_GLOBALISEL_IMPL 44 #define AMDGPUSubtarget GCNSubtarget 45 #include "AMDGPUGenGlobalISel.inc" 46 #undef GET_GLOBALISEL_IMPL 47 #undef AMDGPUSubtarget 48 49 AMDGPUInstructionSelector::AMDGPUInstructionSelector( 50 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, 51 const AMDGPUTargetMachine &TM) 52 : InstructionSelector(), TII(*STI.getInstrInfo()), 53 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), 54 STI(STI), 55 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), 56 #define GET_GLOBALISEL_PREDICATES_INIT 57 #include "AMDGPUGenGlobalISel.inc" 58 #undef GET_GLOBALISEL_PREDICATES_INIT 59 #define GET_GLOBALISEL_TEMPORARIES_INIT 60 #include "AMDGPUGenGlobalISel.inc" 61 #undef GET_GLOBALISEL_TEMPORARIES_INIT 62 { 63 } 64 65 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } 66 67 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, 68 CodeGenCoverage &CoverageInfo) { 69 MRI = &MF.getRegInfo(); 70 InstructionSelector::setupMF(MF, KB, CoverageInfo); 71 } 72 73 bool AMDGPUInstructionSelector::isVCC(Register Reg, 74 const MachineRegisterInfo &MRI) const { 75 if (Register::isPhysicalRegister(Reg)) 76 return Reg == TRI.getVCC(); 77 78 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 79 const TargetRegisterClass *RC = 80 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 81 if (RC) { 82 const LLT Ty = MRI.getType(Reg); 83 return RC->hasSuperClassEq(TRI.getBoolRC()) && 84 Ty.isValid() && Ty.getSizeInBits() == 1; 85 } 86 87 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 88 return RB->getID() == AMDGPU::VCCRegBankID; 89 } 90 91 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, 92 unsigned NewOpc) const { 93 MI.setDesc(TII.get(NewOpc)); 94 
MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to specifically
    // know that the result is VCCRegBank, and not just an SGPR with size 1.
    // An SReg_32 with size 1 is ambiguous with wave32.
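    // (In wave32 a VCC value and an SCC-style boolean both occupy a 32-bit
    // SGPR, so without the bank annotation the two cases cannot be told
    // apart here.)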
174 if (Src.isUndef()) { 175 const TargetRegisterClass *SrcRC = 176 TRI.getConstrainedRegClassForOperand(Src, *MRI); 177 if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 178 return false; 179 } 180 181 return true; 182 } 183 184 for (const MachineOperand &MO : I.operands()) { 185 if (Register::isPhysicalRegister(MO.getReg())) 186 continue; 187 188 const TargetRegisterClass *RC = 189 TRI.getConstrainedRegClassForOperand(MO, *MRI); 190 if (!RC) 191 continue; 192 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 193 } 194 return true; 195 } 196 197 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { 198 const Register DefReg = I.getOperand(0).getReg(); 199 const LLT DefTy = MRI->getType(DefReg); 200 201 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy) 202 203 const RegClassOrRegBank &RegClassOrBank = 204 MRI->getRegClassOrRegBank(DefReg); 205 206 const TargetRegisterClass *DefRC 207 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 208 if (!DefRC) { 209 if (!DefTy.isValid()) { 210 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 211 return false; 212 } 213 214 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 215 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); 216 if (!DefRC) { 217 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 218 return false; 219 } 220 } 221 222 // TODO: Verify that all registers have the same bank 223 I.setDesc(TII.get(TargetOpcode::PHI)); 224 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); 225 } 226 227 MachineOperand 228 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, 229 const TargetRegisterClass &SubRC, 230 unsigned SubIdx) const { 231 232 MachineInstr *MI = MO.getParent(); 233 MachineBasicBlock *BB = MO.getParent()->getParent(); 234 Register DstReg = MRI->createVirtualRegister(&SubRC); 235 236 if (MO.isReg()) { 237 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); 238 Register Reg = MO.getReg(); 239 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 240 .addReg(Reg, 0, ComposedSubIdx); 241 242 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), 243 MO.isKill(), MO.isDead(), MO.isUndef(), 244 MO.isEarlyClobber(), 0, MO.isDebug(), 245 MO.isInternalRead()); 246 } 247 248 assert(MO.isImm()); 249 250 APInt Imm(64, MO.getImm()); 251 252 switch (SubIdx) { 253 default: 254 llvm_unreachable("do not know to split immediate with this sub index."); 255 case AMDGPU::sub0: 256 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); 257 case AMDGPU::sub1: 258 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); 259 } 260 } 261 262 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { 263 switch (Opc) { 264 case AMDGPU::G_AND: 265 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 266 case AMDGPU::G_OR: 267 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; 268 case AMDGPU::G_XOR: 269 return Is64 ? 
AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 270 default: 271 llvm_unreachable("not a bit op"); 272 } 273 } 274 275 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 276 MachineOperand &Dst = I.getOperand(0); 277 MachineOperand &Src0 = I.getOperand(1); 278 MachineOperand &Src1 = I.getOperand(2); 279 Register DstReg = Dst.getReg(); 280 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 281 282 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 283 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 284 const TargetRegisterClass *RC = TRI.getBoolRC(); 285 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), 286 RC == &AMDGPU::SReg_64RegClass); 287 I.setDesc(TII.get(InstOpc)); 288 289 // FIXME: Hack to avoid turning the register bank into a register class. 290 // The selector for G_ICMP relies on seeing the register bank for the result 291 // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will 292 // be ambiguous whether it's a scalar or vector bool. 293 if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) 294 MRI->setRegClass(Src0.getReg(), RC); 295 if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) 296 MRI->setRegClass(Src1.getReg(), RC); 297 298 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 299 } 300 301 // TODO: Should this allow an SCC bank result, and produce a copy from SCC for 302 // the result? 303 if (DstRB->getID() == AMDGPU::SGPRRegBankID) { 304 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); 305 I.setDesc(TII.get(InstOpc)); 306 // Dead implicit-def of scc 307 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 308 true, // isImp 309 false, // isKill 310 true)); // isDead 311 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 312 } 313 314 return false; 315 } 316 317 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 318 MachineBasicBlock *BB = I.getParent(); 319 MachineFunction *MF = BB->getParent(); 320 Register DstReg = I.getOperand(0).getReg(); 321 const DebugLoc &DL = I.getDebugLoc(); 322 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 323 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 324 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 325 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 326 327 if (Size == 32) { 328 if (IsSALU) { 329 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 330 MachineInstr *Add = 331 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 332 .add(I.getOperand(1)) 333 .add(I.getOperand(2)); 334 I.eraseFromParent(); 335 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 336 } 337 338 if (STI.hasAddNoCarry()) { 339 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 340 I.setDesc(TII.get(Opc)); 341 I.addOperand(*MF, MachineOperand::CreateImm(0)); 342 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 343 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 344 } 345 346 const unsigned Opc = Sub ? 
AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; 347 348 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); 349 MachineInstr *Add 350 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 351 .addDef(UnusedCarry, RegState::Dead) 352 .add(I.getOperand(1)) 353 .add(I.getOperand(2)) 354 .addImm(0); 355 I.eraseFromParent(); 356 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 357 } 358 359 assert(!Sub && "illegal sub should not reach here"); 360 361 const TargetRegisterClass &RC 362 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; 363 const TargetRegisterClass &HalfRC 364 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass; 365 366 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0)); 367 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0)); 368 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); 369 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); 370 371 Register DstLo = MRI->createVirtualRegister(&HalfRC); 372 Register DstHi = MRI->createVirtualRegister(&HalfRC); 373 374 if (IsSALU) { 375 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) 376 .add(Lo1) 377 .add(Lo2); 378 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) 379 .add(Hi1) 380 .add(Hi2); 381 } else { 382 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); 383 Register CarryReg = MRI->createVirtualRegister(CarryRC); 384 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo) 385 .addDef(CarryReg) 386 .add(Lo1) 387 .add(Lo2) 388 .addImm(0); 389 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) 390 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) 391 .add(Hi1) 392 .add(Hi2) 393 .addReg(CarryReg, RegState::Kill) 394 .addImm(0); 395 396 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) 397 return false; 398 } 399 400 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 401 .addReg(DstLo) 402 .addImm(AMDGPU::sub0) 403 .addReg(DstHi) 404 .addImm(AMDGPU::sub1); 405 406 407 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 408 return false; 409 410 I.eraseFromParent(); 411 return true; 412 } 413 414 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( 415 MachineInstr &I) const { 416 MachineBasicBlock *BB = I.getParent(); 417 MachineFunction *MF = BB->getParent(); 418 const DebugLoc &DL = I.getDebugLoc(); 419 Register Dst0Reg = I.getOperand(0).getReg(); 420 Register Dst1Reg = I.getOperand(1).getReg(); 421 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO || 422 I.getOpcode() == AMDGPU::G_UADDE; 423 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE || 424 I.getOpcode() == AMDGPU::G_USUBE; 425 426 if (isVCC(Dst1Reg, *MRI)) { 427 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned 428 // carry out despite the _i32 name. These were renamed in VI to _U32. 429 // FIXME: We should probably rename the opcodes here. 430 unsigned NoCarryOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; 431 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 432 I.setDesc(TII.get(HasCarryIn ? 
CarryOpc : NoCarryOpc)); 433 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 434 I.addOperand(*MF, MachineOperand::CreateImm(0)); 435 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 436 } 437 438 Register Src0Reg = I.getOperand(2).getReg(); 439 Register Src1Reg = I.getOperand(3).getReg(); 440 441 if (HasCarryIn) { 442 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 443 .addReg(I.getOperand(4).getReg()); 444 } 445 446 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 447 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 448 449 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg) 450 .add(I.getOperand(2)) 451 .add(I.getOperand(3)); 452 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 453 .addReg(AMDGPU::SCC); 454 455 if (!MRI->getRegClassOrNull(Dst1Reg)) 456 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 457 458 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 459 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 460 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 461 return false; 462 463 if (HasCarryIn && 464 !RBI.constrainGenericRegister(I.getOperand(4).getReg(), 465 AMDGPU::SReg_32RegClass, *MRI)) 466 return false; 467 468 I.eraseFromParent(); 469 return true; 470 } 471 472 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 473 MachineBasicBlock *BB = I.getParent(); 474 Register DstReg = I.getOperand(0).getReg(); 475 Register SrcReg = I.getOperand(1).getReg(); 476 LLT DstTy = MRI->getType(DstReg); 477 LLT SrcTy = MRI->getType(SrcReg); 478 const unsigned SrcSize = SrcTy.getSizeInBits(); 479 const unsigned DstSize = DstTy.getSizeInBits(); 480 481 // TODO: Should handle any multiple of 32 offset. 
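  // For now the offset must be a multiple of the destination size, so the
  // extract can be emitted as a plain subregister copy below (e.g. extracting
  // bits [63:32] of a 64-bit source is just a copy of sub1).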
482 unsigned Offset = I.getOperand(2).getImm(); 483 if (Offset % DstSize != 0) 484 return false; 485 486 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 487 const TargetRegisterClass *SrcRC = 488 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 489 if (!SrcRC) 490 return false; 491 492 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 493 494 const DebugLoc &DL = I.getDebugLoc(); 495 MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) 496 .addReg(SrcReg, 0, SubRegs[Offset / DstSize]); 497 498 for (const MachineOperand &MO : Copy->operands()) { 499 const TargetRegisterClass *RC = 500 TRI.getConstrainedRegClassForOperand(MO, *MRI); 501 if (!RC) 502 continue; 503 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 504 } 505 I.eraseFromParent(); 506 return true; 507 } 508 509 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 510 MachineBasicBlock *BB = MI.getParent(); 511 Register DstReg = MI.getOperand(0).getReg(); 512 LLT DstTy = MRI->getType(DstReg); 513 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 514 515 const unsigned SrcSize = SrcTy.getSizeInBits(); 516 if (SrcSize < 32) 517 return selectImpl(MI, *CoverageInfo); 518 519 const DebugLoc &DL = MI.getDebugLoc(); 520 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 521 const unsigned DstSize = DstTy.getSizeInBits(); 522 const TargetRegisterClass *DstRC = 523 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 524 if (!DstRC) 525 return false; 526 527 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 528 MachineInstrBuilder MIB = 529 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 530 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 531 MachineOperand &Src = MI.getOperand(I + 1); 532 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 533 MIB.addImm(SubRegs[I]); 534 535 const TargetRegisterClass *SrcRC 536 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 537 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 538 return false; 539 } 540 541 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 542 return false; 543 544 MI.eraseFromParent(); 545 return true; 546 } 547 548 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 549 MachineBasicBlock *BB = MI.getParent(); 550 const int NumDst = MI.getNumOperands() - 1; 551 552 MachineOperand &Src = MI.getOperand(NumDst); 553 554 Register SrcReg = Src.getReg(); 555 Register DstReg0 = MI.getOperand(0).getReg(); 556 LLT DstTy = MRI->getType(DstReg0); 557 LLT SrcTy = MRI->getType(SrcReg); 558 559 const unsigned DstSize = DstTy.getSizeInBits(); 560 const unsigned SrcSize = SrcTy.getSizeInBits(); 561 const DebugLoc &DL = MI.getDebugLoc(); 562 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 563 564 const TargetRegisterClass *SrcRC = 565 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 566 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 567 return false; 568 569 const unsigned SrcFlags = getUndefRegState(Src.isUndef()); 570 571 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 572 // source, and this relies on the fact that the same subregister indices are 573 // used for both. 
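  // Each result is selected as a copy of one subregister of the source, e.g.
  // unmerging a 64-bit value into two 32-bit pieces copies sub0 and sub1.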
574 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 575 for (int I = 0, E = NumDst; I != E; ++I) { 576 MachineOperand &Dst = MI.getOperand(I); 577 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 578 .addReg(SrcReg, SrcFlags, SubRegs[I]); 579 580 const TargetRegisterClass *DstRC = 581 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 582 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 583 return false; 584 } 585 586 MI.eraseFromParent(); 587 return true; 588 } 589 590 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 591 return selectG_ADD_SUB(I); 592 } 593 594 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 595 const MachineOperand &MO = I.getOperand(0); 596 597 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 598 // regbank check here is to know why getConstrainedRegClassForOperand failed. 599 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 600 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 601 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 602 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 603 return true; 604 } 605 606 return false; 607 } 608 609 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 610 MachineBasicBlock *BB = I.getParent(); 611 612 Register DstReg = I.getOperand(0).getReg(); 613 Register Src0Reg = I.getOperand(1).getReg(); 614 Register Src1Reg = I.getOperand(2).getReg(); 615 LLT Src1Ty = MRI->getType(Src1Reg); 616 617 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 618 unsigned InsSize = Src1Ty.getSizeInBits(); 619 620 int64_t Offset = I.getOperand(3).getImm(); 621 if (Offset % 32 != 0) 622 return false; 623 624 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 625 if (SubReg == AMDGPU::NoSubRegister) 626 return false; 627 628 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 629 const TargetRegisterClass *DstRC = 630 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 631 if (!DstRC) 632 return false; 633 634 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 635 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 636 const TargetRegisterClass *Src0RC = 637 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 638 const TargetRegisterClass *Src1RC = 639 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 640 641 // Deal with weird cases where the class only partially supports the subreg 642 // index. 
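  // getSubClassWithSubReg returns the largest subclass of Src0RC that supports
  // SubReg, or null if no subclass does, in which case we bail out.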
643 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); 644 if (!Src0RC) 645 return false; 646 647 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 648 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) || 649 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI)) 650 return false; 651 652 const DebugLoc &DL = I.getDebugLoc(); 653 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg) 654 .addReg(Src0Reg) 655 .addReg(Src1Reg) 656 .addImm(SubReg); 657 658 I.eraseFromParent(); 659 return true; 660 } 661 662 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const { 663 if (STI.getLDSBankCount() != 16) 664 return selectImpl(MI, *CoverageInfo); 665 666 Register Dst = MI.getOperand(0).getReg(); 667 Register Src0 = MI.getOperand(2).getReg(); 668 Register M0Val = MI.getOperand(6).getReg(); 669 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) || 670 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) || 671 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI)) 672 return false; 673 674 // This requires 2 instructions. It is possible to write a pattern to support 675 // this, but the generated isel emitter doesn't correctly deal with multiple 676 // output instructions using the same physical register input. The copy to m0 677 // is incorrectly placed before the second instruction. 678 // 679 // TODO: Match source modifiers. 680 681 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 682 const DebugLoc &DL = MI.getDebugLoc(); 683 MachineBasicBlock *MBB = MI.getParent(); 684 685 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 686 .addReg(M0Val); 687 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov) 688 .addImm(2) 689 .addImm(MI.getOperand(4).getImm()) // $attr 690 .addImm(MI.getOperand(3).getImm()); // $attrchan 691 692 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst) 693 .addImm(0) // $src0_modifiers 694 .addReg(Src0) // $src0 695 .addImm(MI.getOperand(4).getImm()) // $attr 696 .addImm(MI.getOperand(3).getImm()) // $attrchan 697 .addImm(0) // $src2_modifiers 698 .addReg(InterpMov) // $src2 - 2 f16 values selected by high 699 .addImm(MI.getOperand(5).getImm()) // $high 700 .addImm(0) // $clamp 701 .addImm(0); // $omod 702 703 MI.eraseFromParent(); 704 return true; 705 } 706 707 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { 708 unsigned IntrinsicID = I.getIntrinsicID(); 709 switch (IntrinsicID) { 710 case Intrinsic::amdgcn_if_break: { 711 MachineBasicBlock *BB = I.getParent(); 712 713 // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick 714 // SelectionDAG uses for wave32 vs wave64. 
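    // SI_IF_BREAK merges the break condition into the loop's accumulated exit
    // mask; the result and both sources are lane masks, hence the wave mask
    // register class set on all three registers below.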
715 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK)) 716 .add(I.getOperand(0)) 717 .add(I.getOperand(2)) 718 .add(I.getOperand(3)); 719 720 Register DstReg = I.getOperand(0).getReg(); 721 Register Src0Reg = I.getOperand(2).getReg(); 722 Register Src1Reg = I.getOperand(3).getReg(); 723 724 I.eraseFromParent(); 725 726 for (Register Reg : { DstReg, Src0Reg, Src1Reg }) 727 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 728 729 return true; 730 } 731 case Intrinsic::amdgcn_interp_p1_f16: 732 return selectInterpP1F16(I); 733 case Intrinsic::amdgcn_wqm: 734 return constrainCopyLikeIntrin(I, AMDGPU::WQM); 735 case Intrinsic::amdgcn_softwqm: 736 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); 737 case Intrinsic::amdgcn_wwm: 738 return constrainCopyLikeIntrin(I, AMDGPU::WWM); 739 default: 740 return selectImpl(I, *CoverageInfo); 741 } 742 } 743 744 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) { 745 if (Size != 32 && Size != 64) 746 return -1; 747 switch (P) { 748 default: 749 llvm_unreachable("Unknown condition code!"); 750 case CmpInst::ICMP_NE: 751 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64; 752 case CmpInst::ICMP_EQ: 753 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64; 754 case CmpInst::ICMP_SGT: 755 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64; 756 case CmpInst::ICMP_SGE: 757 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64; 758 case CmpInst::ICMP_SLT: 759 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64; 760 case CmpInst::ICMP_SLE: 761 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64; 762 case CmpInst::ICMP_UGT: 763 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64; 764 case CmpInst::ICMP_UGE: 765 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64; 766 case CmpInst::ICMP_ULT: 767 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64; 768 case CmpInst::ICMP_ULE: 769 return Size == 32 ? 
AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64; 770 } 771 } 772 773 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, 774 unsigned Size) const { 775 if (Size == 64) { 776 if (!STI.hasScalarCompareEq64()) 777 return -1; 778 779 switch (P) { 780 case CmpInst::ICMP_NE: 781 return AMDGPU::S_CMP_LG_U64; 782 case CmpInst::ICMP_EQ: 783 return AMDGPU::S_CMP_EQ_U64; 784 default: 785 return -1; 786 } 787 } 788 789 if (Size != 32) 790 return -1; 791 792 switch (P) { 793 case CmpInst::ICMP_NE: 794 return AMDGPU::S_CMP_LG_U32; 795 case CmpInst::ICMP_EQ: 796 return AMDGPU::S_CMP_EQ_U32; 797 case CmpInst::ICMP_SGT: 798 return AMDGPU::S_CMP_GT_I32; 799 case CmpInst::ICMP_SGE: 800 return AMDGPU::S_CMP_GE_I32; 801 case CmpInst::ICMP_SLT: 802 return AMDGPU::S_CMP_LT_I32; 803 case CmpInst::ICMP_SLE: 804 return AMDGPU::S_CMP_LE_I32; 805 case CmpInst::ICMP_UGT: 806 return AMDGPU::S_CMP_GT_U32; 807 case CmpInst::ICMP_UGE: 808 return AMDGPU::S_CMP_GE_U32; 809 case CmpInst::ICMP_ULT: 810 return AMDGPU::S_CMP_LT_U32; 811 case CmpInst::ICMP_ULE: 812 return AMDGPU::S_CMP_LE_U32; 813 default: 814 llvm_unreachable("Unknown condition code!"); 815 } 816 } 817 818 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { 819 MachineBasicBlock *BB = I.getParent(); 820 const DebugLoc &DL = I.getDebugLoc(); 821 822 Register SrcReg = I.getOperand(2).getReg(); 823 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 824 825 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 826 827 Register CCReg = I.getOperand(0).getReg(); 828 if (!isVCC(CCReg, *MRI)) { 829 int Opcode = getS_CMPOpcode(Pred, Size); 830 if (Opcode == -1) 831 return false; 832 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) 833 .add(I.getOperand(2)) 834 .add(I.getOperand(3)); 835 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) 836 .addReg(AMDGPU::SCC); 837 bool Ret = 838 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && 839 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); 840 I.eraseFromParent(); 841 return Ret; 842 } 843 844 int Opcode = getV_CMPOpcode(Pred, Size); 845 if (Opcode == -1) 846 return false; 847 848 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), 849 I.getOperand(0).getReg()) 850 .add(I.getOperand(2)) 851 .add(I.getOperand(3)); 852 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), 853 *TRI.getBoolRC(), *MRI); 854 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); 855 I.eraseFromParent(); 856 return Ret; 857 } 858 859 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { 860 // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick 861 // SelectionDAG uses for wave32 vs wave64. 
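  // SI_END_CF re-enables the exec lanes masked off by the matching
  // SI_IF/SI_ELSE, using the saved mask operand; give that register the wave
  // mask class below if nothing has set one yet.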
862 MachineBasicBlock *BB = MI.getParent(); 863 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) 864 .add(MI.getOperand(1)); 865 866 Register Reg = MI.getOperand(1).getReg(); 867 MI.eraseFromParent(); 868 869 if (!MRI->getRegClassOrNull(Reg)) 870 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 871 return true; 872 } 873 874 static unsigned getDSShaderTypeValue(const MachineFunction &MF) { 875 switch (MF.getFunction().getCallingConv()) { 876 case CallingConv::AMDGPU_PS: 877 return 1; 878 case CallingConv::AMDGPU_VS: 879 return 2; 880 case CallingConv::AMDGPU_GS: 881 return 3; 882 case CallingConv::AMDGPU_HS: 883 case CallingConv::AMDGPU_LS: 884 case CallingConv::AMDGPU_ES: 885 report_fatal_error("ds_ordered_count unsupported for this calling conv"); 886 case CallingConv::AMDGPU_CS: 887 case CallingConv::AMDGPU_KERNEL: 888 case CallingConv::C: 889 case CallingConv::Fast: 890 default: 891 // Assume other calling conventions are various compute callable functions 892 return 0; 893 } 894 } 895 896 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 897 MachineInstr &MI, Intrinsic::ID IntrID) const { 898 MachineBasicBlock *MBB = MI.getParent(); 899 MachineFunction *MF = MBB->getParent(); 900 const DebugLoc &DL = MI.getDebugLoc(); 901 902 unsigned IndexOperand = MI.getOperand(7).getImm(); 903 bool WaveRelease = MI.getOperand(8).getImm() != 0; 904 bool WaveDone = MI.getOperand(9).getImm() != 0; 905 906 if (WaveDone && !WaveRelease) 907 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 908 909 unsigned OrderedCountIndex = IndexOperand & 0x3f; 910 IndexOperand &= ~0x3f; 911 unsigned CountDw = 0; 912 913 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 914 CountDw = (IndexOperand >> 24) & 0xf; 915 IndexOperand &= ~(0xf << 24); 916 917 if (CountDw < 1 || CountDw > 4) { 918 report_fatal_error( 919 "ds_ordered_count: dword count must be between 1 and 4"); 920 } 921 } 922 923 if (IndexOperand) 924 report_fatal_error("ds_ordered_count: bad index operand"); 925 926 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; 927 unsigned ShaderType = getDSShaderTypeValue(*MF); 928 929 unsigned Offset0 = OrderedCountIndex << 2; 930 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | 931 (Instruction << 4); 932 933 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) 934 Offset1 |= (CountDw - 1) << 6; 935 936 unsigned Offset = Offset0 | (Offset1 << 8); 937 938 Register M0Val = MI.getOperand(2).getReg(); 939 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 940 .addReg(M0Val); 941 942 Register DstReg = MI.getOperand(0).getReg(); 943 Register ValReg = MI.getOperand(3).getReg(); 944 MachineInstrBuilder DS = 945 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) 946 .addReg(ValReg) 947 .addImm(Offset) 948 .cloneMemRefs(MI); 949 950 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) 951 return false; 952 953 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); 954 MI.eraseFromParent(); 955 return Ret; 956 } 957 958 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 959 switch (IntrID) { 960 case Intrinsic::amdgcn_ds_gws_init: 961 return AMDGPU::DS_GWS_INIT; 962 case Intrinsic::amdgcn_ds_gws_barrier: 963 return AMDGPU::DS_GWS_BARRIER; 964 case Intrinsic::amdgcn_ds_gws_sema_v: 965 return AMDGPU::DS_GWS_SEMA_V; 966 case Intrinsic::amdgcn_ds_gws_sema_br: 967 return AMDGPU::DS_GWS_SEMA_BR; 968 case Intrinsic::amdgcn_ds_gws_sema_p: 969 return AMDGPU::DS_GWS_SEMA_P; 970 case Intrinsic::amdgcn_ds_gws_sema_release_all: 971 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 972 default: 973 llvm_unreachable("not a gws intrinsic"); 974 } 975 } 976 977 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, 978 Intrinsic::ID IID) const { 979 if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && 980 !STI.hasGWSSemaReleaseAll()) 981 return false; 982 983 // intrinsic ID, vsrc, offset 984 const bool HasVSrc = MI.getNumOperands() == 3; 985 assert(HasVSrc || MI.getNumOperands() == 2); 986 987 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg(); 988 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); 989 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) 990 return false; 991 992 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 993 assert(OffsetDef); 994 995 unsigned ImmOffset; 996 997 MachineBasicBlock *MBB = MI.getParent(); 998 const DebugLoc &DL = MI.getDebugLoc(); 999 1000 MachineInstr *Readfirstlane = nullptr; 1001 1002 // If we legalized the VGPR input, strip out the readfirstlane to analyze the 1003 // incoming offset, in case there's an add of a constant. We'll have to put it 1004 // back later. 1005 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { 1006 Readfirstlane = OffsetDef; 1007 BaseOffset = OffsetDef->getOperand(1).getReg(); 1008 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1009 } 1010 1011 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { 1012 // If we have a constant offset, try to use the 0 in m0 as the base. 1013 // TODO: Look into changing the default m0 initialization value. If the 1014 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1015 // the immediate offset. 
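    // Known-constant offset: set m0 to 0 so the entire offset can be folded
    // into the instruction's immediate field.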
1016 1017 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); 1018 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1019 .addImm(0); 1020 } else { 1021 std::tie(BaseOffset, ImmOffset, OffsetDef) 1022 = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset); 1023 1024 if (Readfirstlane) { 1025 // We have the constant offset now, so put the readfirstlane back on the 1026 // variable component. 1027 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) 1028 return false; 1029 1030 Readfirstlane->getOperand(1).setReg(BaseOffset); 1031 BaseOffset = Readfirstlane->getOperand(0).getReg(); 1032 } else { 1033 if (!RBI.constrainGenericRegister(BaseOffset, 1034 AMDGPU::SReg_32RegClass, *MRI)) 1035 return false; 1036 } 1037 1038 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1039 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) 1040 .addReg(BaseOffset) 1041 .addImm(16); 1042 1043 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1044 .addReg(M0Base); 1045 } 1046 1047 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1048 // offset field) % 64. Some versions of the programming guide omit the m0 1049 // part, or claim it's from offset 0. 1050 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); 1051 1052 if (HasVSrc) { 1053 Register VSrc = MI.getOperand(1).getReg(); 1054 MIB.addReg(VSrc); 1055 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) 1056 return false; 1057 } 1058 1059 MIB.addImm(ImmOffset) 1060 .addImm(-1) // $gds 1061 .cloneMemRefs(MI); 1062 1063 MI.eraseFromParent(); 1064 return true; 1065 } 1066 1067 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, 1068 bool IsAppend) const { 1069 Register PtrBase = MI.getOperand(2).getReg(); 1070 LLT PtrTy = MRI->getType(PtrBase); 1071 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 1072 1073 unsigned Offset; 1074 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); 1075 1076 // TODO: Should this try to look through readfirstlane like GWS? 1077 if (!isDSOffsetLegal(PtrBase, Offset, 16)) { 1078 PtrBase = MI.getOperand(2).getReg(); 1079 Offset = 0; 1080 } 1081 1082 MachineBasicBlock *MBB = MI.getParent(); 1083 const DebugLoc &DL = MI.getDebugLoc(); 1084 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1085 1086 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1087 .addReg(PtrBase); 1088 BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) 1089 .addImm(Offset) 1090 .addImm(IsGDS ? 
-1 : 0)
    .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
  MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because that bank does not cover the register class we
    // use to represent it. So we need to manually set the register class here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
            .add(I.getOperand(2))
            .add(I.getOperand(3));

    bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
               constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
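  // Only 32-bit (or smaller) values should reach the V_CNDMASK_B32 path below;
  // anything wider is rejected rather than selected here.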
1156 if (Size > 32) 1157 return false; 1158 1159 MachineInstr *Select = 1160 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1161 .addImm(0) 1162 .add(I.getOperand(3)) 1163 .addImm(0) 1164 .add(I.getOperand(2)) 1165 .add(I.getOperand(1)); 1166 1167 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1168 I.eraseFromParent(); 1169 return Ret; 1170 } 1171 1172 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1173 initM0(I); 1174 return selectImpl(I, *CoverageInfo); 1175 } 1176 1177 static int sizeToSubRegIndex(unsigned Size) { 1178 switch (Size) { 1179 case 32: 1180 return AMDGPU::sub0; 1181 case 64: 1182 return AMDGPU::sub0_sub1; 1183 case 96: 1184 return AMDGPU::sub0_sub1_sub2; 1185 case 128: 1186 return AMDGPU::sub0_sub1_sub2_sub3; 1187 case 256: 1188 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1189 default: 1190 if (Size < 32) 1191 return AMDGPU::sub0; 1192 if (Size > 256) 1193 return -1; 1194 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1195 } 1196 } 1197 1198 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1199 Register DstReg = I.getOperand(0).getReg(); 1200 Register SrcReg = I.getOperand(1).getReg(); 1201 const LLT DstTy = MRI->getType(DstReg); 1202 const LLT SrcTy = MRI->getType(SrcReg); 1203 if (!DstTy.isScalar()) 1204 return false; 1205 1206 const LLT S1 = LLT::scalar(1); 1207 1208 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1209 const RegisterBank *DstRB; 1210 if (DstTy == S1) { 1211 // This is a special case. We don't treat s1 for legalization artifacts as 1212 // vcc booleans. 1213 DstRB = SrcRB; 1214 } else { 1215 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1216 if (SrcRB != DstRB) 1217 return false; 1218 } 1219 1220 unsigned DstSize = DstTy.getSizeInBits(); 1221 unsigned SrcSize = SrcTy.getSizeInBits(); 1222 1223 const TargetRegisterClass *SrcRC 1224 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1225 const TargetRegisterClass *DstRC 1226 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1227 1228 if (SrcSize > 32) { 1229 int SubRegIdx = sizeToSubRegIndex(DstSize); 1230 if (SubRegIdx == -1) 1231 return false; 1232 1233 // Deal with weird cases where the class only partially supports the subreg 1234 // index. 1235 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1236 if (!SrcRC) 1237 return false; 1238 1239 I.getOperand(1).setSubReg(SubRegIdx); 1240 } 1241 1242 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1243 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1244 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1245 return false; 1246 } 1247 1248 I.setDesc(TII.get(TargetOpcode::COPY)); 1249 return true; 1250 } 1251 1252 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1253 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1254 Mask = maskTrailingOnes<unsigned>(Size); 1255 int SignedMask = static_cast<int>(Mask); 1256 return SignedMask >= -16 && SignedMask <= 64; 1257 } 1258 1259 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1260 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1261 Register Reg, const MachineRegisterInfo &MRI, 1262 const TargetRegisterInfo &TRI) const { 1263 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1264 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1265 return RB; 1266 1267 // Ignore the type, since we don't use vcc in artifacts. 
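  // Passing an empty LLT here means the bank is chosen from the class alone,
  // so a 1-bit value held in an SGPR class is not treated as VCC.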
1268 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1269 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1270 return nullptr; 1271 } 1272 1273 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1274 bool Signed = I.getOpcode() == AMDGPU::G_SEXT; 1275 const DebugLoc &DL = I.getDebugLoc(); 1276 MachineBasicBlock &MBB = *I.getParent(); 1277 const Register DstReg = I.getOperand(0).getReg(); 1278 const Register SrcReg = I.getOperand(1).getReg(); 1279 1280 const LLT DstTy = MRI->getType(DstReg); 1281 const LLT SrcTy = MRI->getType(SrcReg); 1282 const unsigned SrcSize = SrcTy.getSizeInBits(); 1283 const unsigned DstSize = DstTy.getSizeInBits(); 1284 if (!DstTy.isScalar()) 1285 return false; 1286 1287 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1288 return selectCOPY(I); 1289 1290 // Artifact casts should never use vcc. 1291 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1292 1293 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1294 // 64-bit should have been split up in RegBankSelect 1295 1296 // Try to use an and with a mask if it will save code size. 1297 unsigned Mask; 1298 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1299 MachineInstr *ExtI = 1300 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1301 .addImm(Mask) 1302 .addReg(SrcReg); 1303 I.eraseFromParent(); 1304 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1305 } 1306 1307 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1308 MachineInstr *ExtI = 1309 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1310 .addReg(SrcReg) 1311 .addImm(0) // Offset 1312 .addImm(SrcSize); // Width 1313 I.eraseFromParent(); 1314 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1315 } 1316 1317 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1318 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) 1319 return false; 1320 1321 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1322 const unsigned SextOpc = SrcSize == 8 ? 1323 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1324 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1325 .addReg(SrcReg); 1326 I.eraseFromParent(); 1327 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1328 } 1329 1330 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1331 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1332 1333 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1334 if (DstSize > 32 && SrcSize <= 32) { 1335 // We need a 64-bit register source, but the high bits don't matter. 
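      // Widen the source to 64 bits by pairing it with an undefined high half
      // in a REG_SEQUENCE, then let the 64-bit scalar BFE perform the
      // extension.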
1336 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1337 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1338 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1339 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1340 .addReg(SrcReg) 1341 .addImm(AMDGPU::sub0) 1342 .addReg(UndefReg) 1343 .addImm(AMDGPU::sub1); 1344 1345 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1346 .addReg(ExtReg) 1347 .addImm(SrcSize << 16); 1348 1349 I.eraseFromParent(); 1350 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1351 } 1352 1353 unsigned Mask; 1354 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1355 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1356 .addReg(SrcReg) 1357 .addImm(Mask); 1358 } else { 1359 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1360 .addReg(SrcReg) 1361 .addImm(SrcSize << 16); 1362 } 1363 1364 I.eraseFromParent(); 1365 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1366 } 1367 1368 return false; 1369 } 1370 1371 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1372 MachineBasicBlock *BB = I.getParent(); 1373 MachineOperand &ImmOp = I.getOperand(1); 1374 1375 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1376 if (ImmOp.isFPImm()) { 1377 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1378 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1379 } else if (ImmOp.isCImm()) { 1380 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1381 } 1382 1383 Register DstReg = I.getOperand(0).getReg(); 1384 unsigned Size; 1385 bool IsSgpr; 1386 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1387 if (RB) { 1388 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1389 Size = MRI->getType(DstReg).getSizeInBits(); 1390 } else { 1391 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1392 IsSgpr = TRI.isSGPRClass(RC); 1393 Size = TRI.getRegSizeInBits(*RC); 1394 } 1395 1396 if (Size != 32 && Size != 64) 1397 return false; 1398 1399 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1400 if (Size == 32) { 1401 I.setDesc(TII.get(Opcode)); 1402 I.addImplicitDefUseOperands(*MF); 1403 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1404 } 1405 1406 const DebugLoc &DL = I.getDebugLoc(); 1407 1408 APInt Imm(Size, I.getOperand(1).getImm()); 1409 1410 MachineInstr *ResInst; 1411 if (IsSgpr && TII.isInlineConstant(Imm)) { 1412 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1413 .addImm(I.getOperand(1).getImm()); 1414 } else { 1415 const TargetRegisterClass *RC = IsSgpr ? 
1416 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 1417 Register LoReg = MRI->createVirtualRegister(RC); 1418 Register HiReg = MRI->createVirtualRegister(RC); 1419 1420 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 1421 .addImm(Imm.trunc(32).getZExtValue()); 1422 1423 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 1424 .addImm(Imm.ashr(32).getZExtValue()); 1425 1426 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1427 .addReg(LoReg) 1428 .addImm(AMDGPU::sub0) 1429 .addReg(HiReg) 1430 .addImm(AMDGPU::sub1); 1431 } 1432 1433 // We can't call constrainSelectedInstRegOperands here, because it doesn't 1434 // work for target independent opcodes 1435 I.eraseFromParent(); 1436 const TargetRegisterClass *DstRC = 1437 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 1438 if (!DstRC) 1439 return true; 1440 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1441 } 1442 1443 static bool isConstant(const MachineInstr &MI) { 1444 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 1445 } 1446 1447 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 1448 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 1449 1450 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); 1451 1452 assert(PtrMI); 1453 1454 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 1455 return; 1456 1457 GEPInfo GEPInfo(*PtrMI); 1458 1459 for (unsigned i = 1; i != 3; ++i) { 1460 const MachineOperand &GEPOp = PtrMI->getOperand(i); 1461 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 1462 assert(OpDef); 1463 if (i == 2 && isConstant(*OpDef)) { 1464 // TODO: Could handle constant base + variable offset, but a combine 1465 // probably should have commuted it. 1466 assert(GEPInfo.Imm == 0); 1467 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 1468 continue; 1469 } 1470 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 1471 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 1472 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 1473 else 1474 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 1475 } 1476 1477 AddrInfo.push_back(GEPInfo); 1478 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 1479 } 1480 1481 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 1482 if (!MI.hasOneMemOperand()) 1483 return false; 1484 1485 const MachineMemOperand *MMO = *MI.memoperands_begin(); 1486 const Value *Ptr = MMO->getValue(); 1487 1488 // UndefValue means this is a load of a kernel input. These are uniform. 1489 // Sometimes LDS instructions have constant pointers. 1490 // If Ptr is null, then that means this mem operand contains a 1491 // PseudoSourceValue like GOT. 
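  // Each of the pointer kinds accepted below is known to be uniform across the
  // wave, so the memory access can be treated as uniform.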
1492 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || 1493 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) 1494 return true; 1495 1496 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 1497 return true; 1498 1499 const Instruction *I = dyn_cast<Instruction>(Ptr); 1500 return I && I->getMetadata("amdgpu.uniform"); 1501 } 1502 1503 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { 1504 for (const GEPInfo &GEPInfo : AddrInfo) { 1505 if (!GEPInfo.VgprParts.empty()) 1506 return true; 1507 } 1508 return false; 1509 } 1510 1511 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { 1512 MachineBasicBlock *BB = I.getParent(); 1513 1514 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); 1515 unsigned AS = PtrTy.getAddressSpace(); 1516 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && 1517 STI.ldsRequiresM0Init()) { 1518 // If DS instructions require M0 initializtion, insert it before selecting. 1519 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1520 .addImm(-1); 1521 } 1522 } 1523 1524 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { 1525 initM0(I); 1526 return selectImpl(I, *CoverageInfo); 1527 } 1528 1529 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 1530 MachineBasicBlock *BB = I.getParent(); 1531 MachineOperand &CondOp = I.getOperand(0); 1532 Register CondReg = CondOp.getReg(); 1533 const DebugLoc &DL = I.getDebugLoc(); 1534 1535 unsigned BrOpcode; 1536 Register CondPhysReg; 1537 const TargetRegisterClass *ConstrainRC; 1538 1539 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 1540 // whether the branch is uniform when selecting the instruction. In 1541 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 1542 // RegBankSelect knows what it's doing if the branch condition is scc, even 1543 // though it currently does not. 1544 if (!isVCC(CondReg, *MRI)) { 1545 if (MRI->getType(CondReg) != LLT::scalar(32)) 1546 return false; 1547 1548 CondPhysReg = AMDGPU::SCC; 1549 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 1550 // FIXME: Hack for isSCC tests 1551 ConstrainRC = &AMDGPU::SGPR_32RegClass; 1552 } else { 1553 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 1554 // We sort of know that a VCC producer based on the register bank, that ands 1555 // inactive lanes with 0. What if there was a logical operation with vcc 1556 // producers in different blocks/with different exec masks? 1557 // FIXME: Should scc->vcc copies and with exec? 1558 CondPhysReg = TRI.getVCC(); 1559 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 1560 ConstrainRC = TRI.getBoolRC(); 1561 } 1562 1563 if (!MRI->getRegClassOrNull(CondReg)) 1564 MRI->setRegClass(CondReg, ConstrainRC); 1565 1566 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 1567 .addReg(CondReg); 1568 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 1569 .addMBB(I.getOperand(1).getMBB()); 1570 1571 I.eraseFromParent(); 1572 return true; 1573 } 1574 1575 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const { 1576 Register DstReg = I.getOperand(0).getReg(); 1577 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1578 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1579 I.setDesc(TII.get(IsVGPR ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 1580 if (IsVGPR) 1581 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 1582 1583 return RBI.constrainGenericRegister( 1584 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 1585 } 1586 1587 bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { 1588 uint64_t Align = I.getOperand(2).getImm(); 1589 const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); 1590 1591 MachineBasicBlock *BB = I.getParent(); 1592 1593 Register DstReg = I.getOperand(0).getReg(); 1594 Register SrcReg = I.getOperand(1).getReg(); 1595 1596 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1597 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1598 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1599 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1600 unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1601 const TargetRegisterClass &RegRC 1602 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 1603 1604 LLT Ty = MRI->getType(DstReg); 1605 1606 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 1607 *MRI); 1608 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 1609 *MRI); 1610 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 1611 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 1612 return false; 1613 1614 const DebugLoc &DL = I.getDebugLoc(); 1615 Register ImmReg = MRI->createVirtualRegister(&RegRC); 1616 BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg) 1617 .addImm(Mask); 1618 1619 if (Ty.getSizeInBits() == 32) { 1620 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 1621 .addReg(SrcReg) 1622 .addReg(ImmReg); 1623 I.eraseFromParent(); 1624 return true; 1625 } 1626 1627 Register HiReg = MRI->createVirtualRegister(&RegRC); 1628 Register LoReg = MRI->createVirtualRegister(&RegRC); 1629 Register MaskLo = MRI->createVirtualRegister(&RegRC); 1630 1631 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 1632 .addReg(SrcReg, 0, AMDGPU::sub0); 1633 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 1634 .addReg(SrcReg, 0, AMDGPU::sub1); 1635 1636 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo) 1637 .addReg(LoReg) 1638 .addReg(ImmReg); 1639 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1640 .addReg(MaskLo) 1641 .addImm(AMDGPU::sub0) 1642 .addReg(HiReg) 1643 .addImm(AMDGPU::sub1); 1644 I.eraseFromParent(); 1645 return true; 1646 } 1647 1648 /// Return the register to use for the index value, and the subregister to use 1649 /// for the indirectly accessed register. 1650 static std::pair<Register, unsigned> 1651 computeIndirectRegIndex(MachineRegisterInfo &MRI, 1652 const SIRegisterInfo &TRI, 1653 const TargetRegisterClass *SuperRC, 1654 Register IdxReg, 1655 unsigned EltSize) { 1656 Register IdxBaseReg; 1657 int Offset; 1658 MachineInstr *Unused; 1659 1660 std::tie(IdxBaseReg, Offset, Unused) 1661 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 1662 1663 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 1664 1665 // Skip out of bounds offsets, or else we would end up using an undefined 1666 // register. 
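  // If the folded constant offset would index past the last subregister, fall
  // back to the original index register and the first subregister.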
  if (static_cast<unsigned>(Offset) >= SubRegs.size())
    return std::make_pair(IdxReg, SubRegs[0]);
  return std::make_pair(IdxBaseReg, SubRegs[Offset]);
}

bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register IdxReg = MI.getOperand(2).getReg();

  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
                                                                  *MRI);
  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
                                                                  *MRI);
  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const bool Is64 = DstTy.getSizeInBits() == 64;

  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
                                                     DstTy.getSizeInBits() / 8);

  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
    if (DstTy.getSizeInBits() != 32 && !Is64)
      return false;

    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    unsigned Opc = Is64 ?
      AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
    BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
      .addReg(SrcReg, 0, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
    return false;

  if (!STI.useVGPRIndexMode()) {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
    .addReg(IdxReg)
    .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
  BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
    .addReg(SrcReg, RegState::Undef, SubReg)
    .addReg(SrcReg, RegState::Implicit)
    .addReg(AMDGPU::M0, RegState::Implicit);
  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));

  MI.eraseFromParent();
  return true;
}

// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register VecReg = MI.getOperand(1).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  LLT VecTy = MRI->getType(DstReg);
  LLT ValTy = MRI->getType(ValReg);
  unsigned VecSize = VecTy.getSizeInBits();
  unsigned ValSize = ValTy.getSizeInBits();

  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  assert(VecTy.getElementType() == ValTy);

  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
1769 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 1770 return false; 1771 1772 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 1773 *MRI); 1774 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 1775 *MRI); 1776 1777 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 1778 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 1779 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 1780 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 1781 return false; 1782 1783 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 1784 return false; 1785 1786 unsigned SubReg; 1787 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 1788 ValSize / 8); 1789 1790 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 1791 STI.useVGPRIndexMode(); 1792 1793 MachineBasicBlock *BB = MI.getParent(); 1794 const DebugLoc &DL = MI.getDebugLoc(); 1795 1796 if (IndexMode) { 1797 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 1798 .addReg(IdxReg) 1799 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 1800 } else { 1801 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1802 .addReg(IdxReg); 1803 } 1804 1805 const MCInstrDesc &RegWriteOp 1806 = TII.getIndirectRegWritePseudo(VecSize, ValSize, 1807 VecRB->getID() == AMDGPU::SGPRRegBankID); 1808 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 1809 .addReg(VecReg) 1810 .addReg(ValReg) 1811 .addImm(SubReg); 1812 1813 if (IndexMode) 1814 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 1815 1816 MI.eraseFromParent(); 1817 return true; 1818 } 1819 1820 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 1821 if (I.isPHI()) 1822 return selectPHI(I); 1823 1824 if (!I.isPreISelOpcode()) { 1825 if (I.isCopy()) 1826 return selectCOPY(I); 1827 return true; 1828 } 1829 1830 switch (I.getOpcode()) { 1831 case TargetOpcode::G_AND: 1832 case TargetOpcode::G_OR: 1833 case TargetOpcode::G_XOR: 1834 if (selectG_AND_OR_XOR(I)) 1835 return true; 1836 return selectImpl(I, *CoverageInfo); 1837 case TargetOpcode::G_ADD: 1838 case TargetOpcode::G_SUB: 1839 if (selectImpl(I, *CoverageInfo)) 1840 return true; 1841 return selectG_ADD_SUB(I); 1842 case TargetOpcode::G_UADDO: 1843 case TargetOpcode::G_USUBO: 1844 case TargetOpcode::G_UADDE: 1845 case TargetOpcode::G_USUBE: 1846 return selectG_UADDO_USUBO_UADDE_USUBE(I); 1847 case TargetOpcode::G_INTTOPTR: 1848 case TargetOpcode::G_BITCAST: 1849 case TargetOpcode::G_PTRTOINT: 1850 return selectCOPY(I); 1851 case TargetOpcode::G_CONSTANT: 1852 case TargetOpcode::G_FCONSTANT: 1853 return selectG_CONSTANT(I); 1854 case TargetOpcode::G_EXTRACT: 1855 return selectG_EXTRACT(I); 1856 case TargetOpcode::G_MERGE_VALUES: 1857 case TargetOpcode::G_BUILD_VECTOR: 1858 case TargetOpcode::G_CONCAT_VECTORS: 1859 return selectG_MERGE_VALUES(I); 1860 case TargetOpcode::G_UNMERGE_VALUES: 1861 return selectG_UNMERGE_VALUES(I); 1862 case TargetOpcode::G_PTR_ADD: 1863 return selectG_PTR_ADD(I); 1864 case TargetOpcode::G_IMPLICIT_DEF: 1865 return selectG_IMPLICIT_DEF(I); 1866 case TargetOpcode::G_INSERT: 1867 return selectG_INSERT(I); 1868 case TargetOpcode::G_INTRINSIC: 1869 return selectG_INTRINSIC(I); 1870 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 1871 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 1872 case TargetOpcode::G_ICMP: 1873 if (selectG_ICMP(I)) 1874 return true; 1875 return selectImpl(I, *CoverageInfo); 1876 case TargetOpcode::G_LOAD: 1877 case TargetOpcode::G_ATOMIC_CMPXCHG: 1878 case 
TargetOpcode::G_ATOMICRMW_XCHG: 1879 case TargetOpcode::G_ATOMICRMW_ADD: 1880 case TargetOpcode::G_ATOMICRMW_SUB: 1881 case TargetOpcode::G_ATOMICRMW_AND: 1882 case TargetOpcode::G_ATOMICRMW_OR: 1883 case TargetOpcode::G_ATOMICRMW_XOR: 1884 case TargetOpcode::G_ATOMICRMW_MIN: 1885 case TargetOpcode::G_ATOMICRMW_MAX: 1886 case TargetOpcode::G_ATOMICRMW_UMIN: 1887 case TargetOpcode::G_ATOMICRMW_UMAX: 1888 case TargetOpcode::G_ATOMICRMW_FADD: 1889 return selectG_LOAD_ATOMICRMW(I); 1890 case TargetOpcode::G_SELECT: 1891 return selectG_SELECT(I); 1892 case TargetOpcode::G_STORE: 1893 return selectG_STORE(I); 1894 case TargetOpcode::G_TRUNC: 1895 return selectG_TRUNC(I); 1896 case TargetOpcode::G_SEXT: 1897 case TargetOpcode::G_ZEXT: 1898 case TargetOpcode::G_ANYEXT: 1899 if (selectImpl(I, *CoverageInfo)) 1900 return true; 1901 return selectG_SZA_EXT(I); 1902 case TargetOpcode::G_BRCOND: 1903 return selectG_BRCOND(I); 1904 case TargetOpcode::G_FRAME_INDEX: 1905 return selectG_FRAME_INDEX(I); 1906 case TargetOpcode::G_PTR_MASK: 1907 return selectG_PTR_MASK(I); 1908 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1909 return selectG_EXTRACT_VECTOR_ELT(I); 1910 case TargetOpcode::G_INSERT_VECTOR_ELT: 1911 return selectG_INSERT_VECTOR_ELT(I); 1912 case AMDGPU::G_AMDGPU_ATOMIC_INC: 1913 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 1914 initM0(I); 1915 return selectImpl(I, *CoverageInfo); 1916 default: 1917 return selectImpl(I, *CoverageInfo); 1918 } 1919 return false; 1920 } 1921 1922 InstructionSelector::ComplexRendererFns 1923 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 1924 return {{ 1925 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 1926 }}; 1927 1928 } 1929 1930 std::pair<Register, unsigned> 1931 AMDGPUInstructionSelector::selectVOP3ModsImpl( 1932 Register Src) const { 1933 unsigned Mods = 0; 1934 MachineInstr *MI = MRI->getVRegDef(Src); 1935 1936 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 1937 Src = MI->getOperand(1).getReg(); 1938 Mods |= SISrcMods::NEG; 1939 MI = MRI->getVRegDef(Src); 1940 } 1941 1942 if (MI && MI->getOpcode() == AMDGPU::G_FABS) { 1943 Src = MI->getOperand(1).getReg(); 1944 Mods |= SISrcMods::ABS; 1945 } 1946 1947 return std::make_pair(Src, Mods); 1948 } 1949 1950 /// 1951 /// This will select either an SGPR or VGPR operand and will save us from 1952 /// having to write an extra tablegen pattern. 
1953 InstructionSelector::ComplexRendererFns 1954 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 1955 return {{ 1956 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 1957 }}; 1958 } 1959 1960 InstructionSelector::ComplexRendererFns 1961 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 1962 Register Src; 1963 unsigned Mods; 1964 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 1965 1966 return {{ 1967 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 1968 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 1969 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 1970 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 1971 }}; 1972 } 1973 1974 InstructionSelector::ComplexRendererFns 1975 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 1976 return {{ 1977 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 1978 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 1979 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 1980 }}; 1981 } 1982 1983 InstructionSelector::ComplexRendererFns 1984 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 1985 Register Src; 1986 unsigned Mods; 1987 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 1988 1989 return {{ 1990 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 1991 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 1992 }}; 1993 } 1994 1995 InstructionSelector::ComplexRendererFns 1996 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 1997 Register Reg = Root.getReg(); 1998 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 1999 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 2000 Def->getOpcode() == AMDGPU::G_FABS)) 2001 return {}; 2002 return {{ 2003 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2004 }}; 2005 } 2006 2007 InstructionSelector::ComplexRendererFns 2008 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 2009 Register Src; 2010 unsigned Mods; 2011 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 2012 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) 2013 return None; 2014 2015 return {{ 2016 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2017 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2018 }}; 2019 } 2020 2021 InstructionSelector::ComplexRendererFns 2022 AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const { 2023 // FIXME: Handle clamp and op_sel 2024 return {{ 2025 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 2026 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods 2027 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp 2028 }}; 2029 } 2030 2031 InstructionSelector::ComplexRendererFns 2032 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 2033 // FIXME: Handle op_sel 2034 return {{ 2035 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 2036 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 2037 }}; 2038 } 2039 2040 InstructionSelector::ComplexRendererFns 2041 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 2042 SmallVector<GEPInfo, 4> AddrInfo; 2043 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 2044 2045 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 2046 return None; 2047 2048 const GEPInfo &GEPInfo = AddrInfo[0]; 2049 2050 if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm)) 2051 return None; 2052 2053 unsigned PtrReg = 
GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  if (!isUInt<32>(EncodedImm))
    return None;

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far, we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM patterns are considered before the _SGPR patterns.
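  // Materialize the 32-bit offset into an SGPR with S_MOV_B32 and render it as
  // the soffset operand next to the SGPR pointer base.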
2102 unsigned PtrReg = GEPInfo.SgprParts[0]; 2103 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2104 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) 2105 .addImm(GEPInfo.Imm); 2106 return {{ 2107 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 2108 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } 2109 }}; 2110 } 2111 2112 template <bool Signed> 2113 InstructionSelector::ComplexRendererFns 2114 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { 2115 MachineInstr *MI = Root.getParent(); 2116 2117 InstructionSelector::ComplexRendererFns Default = {{ 2118 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 2119 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset 2120 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 2121 }}; 2122 2123 if (!STI.hasFlatInstOffsets()) 2124 return Default; 2125 2126 const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); 2127 if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD) 2128 return Default; 2129 2130 Optional<int64_t> Offset = 2131 getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); 2132 if (!Offset.hasValue()) 2133 return Default; 2134 2135 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 2136 if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed)) 2137 return Default; 2138 2139 Register BasePtr = OpDef->getOperand(1).getReg(); 2140 2141 return {{ 2142 [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, 2143 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, 2144 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 2145 }}; 2146 } 2147 2148 InstructionSelector::ComplexRendererFns 2149 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 2150 return selectFlatOffsetImpl<false>(Root); 2151 } 2152 2153 InstructionSelector::ComplexRendererFns 2154 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { 2155 return selectFlatOffsetImpl<true>(Root); 2156 } 2157 2158 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 2159 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 2160 return PSV && PSV->isStack(); 2161 } 2162 2163 InstructionSelector::ComplexRendererFns 2164 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 2165 MachineInstr *MI = Root.getParent(); 2166 MachineBasicBlock *MBB = MI->getParent(); 2167 MachineFunction *MF = MBB->getParent(); 2168 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 2169 2170 int64_t Offset = 0; 2171 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) { 2172 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2173 2174 // TODO: Should this be inside the render function? The iterator seems to 2175 // move. 2176 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 2177 HighBits) 2178 .addImm(Offset & ~4095); 2179 2180 return {{[=](MachineInstrBuilder &MIB) { // rsrc 2181 MIB.addReg(Info->getScratchRSrcReg()); 2182 }, 2183 [=](MachineInstrBuilder &MIB) { // vaddr 2184 MIB.addReg(HighBits); 2185 }, 2186 [=](MachineInstrBuilder &MIB) { // soffset 2187 const MachineMemOperand *MMO = *MI->memoperands_begin(); 2188 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 2189 2190 Register SOffsetReg = isStackPtrRelative(PtrInfo) 2191 ? 
Info->getStackPtrOffsetReg()
                   : Info->getScratchWaveOffsetReg();
               MIB.addReg(SOffsetReg);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    if (isBaseWithConstantOffset(Root, *MRI)) {
      const MachineOperand &LHS = RootDef->getOperand(1);
      const MachineOperand &RHS = RootDef->getOperand(2);
      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
      if (LHSDef && RHSDef) {
        int64_t PossibleOffset =
          RHSDef->getOperand(1).getCImm()->getSExtValue();
        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
            (!STI.privateMemoryResourceIsRangeChecked() ||
             KnownBits->signBitIsZero(LHS.getReg()))) {
          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
            FI = LHSDef->getOperand(1).getIndex();
          else
            VAddr = LHS.getReg();
          Offset = PossibleOffset;
        }
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  // TODO: Should split large offsets that don't fit like above.
  // TODO: Don't use scratch wave offset just because the offset didn't fit.
  Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
                                   : Info->getScratchWaveOffsetReg();

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             MIB.addReg(SOffset);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}

bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
                                                int64_t Offset,
                                                unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return KnownBits->signBitIsZero(Base);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  Register SOffsetReg = isStackPtrRelative(PtrInfo)
                            ?
Info->getStackPtrOffsetReg() 2287 : Info->getScratchWaveOffsetReg(); 2288 return {{ 2289 [=](MachineInstrBuilder &MIB) { 2290 MIB.addReg(Info->getScratchRSrcReg()); 2291 }, // rsrc 2292 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset 2293 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 2294 }}; 2295 } 2296 2297 std::pair<Register, unsigned> 2298 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 2299 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 2300 if (!RootDef) 2301 return std::make_pair(Root.getReg(), 0); 2302 2303 int64_t ConstAddr = 0; 2304 if (isBaseWithConstantOffset(Root, *MRI)) { 2305 const MachineOperand &LHS = RootDef->getOperand(1); 2306 const MachineOperand &RHS = RootDef->getOperand(2); 2307 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); 2308 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); 2309 if (LHSDef && RHSDef) { 2310 int64_t PossibleOffset = 2311 RHSDef->getOperand(1).getCImm()->getSExtValue(); 2312 if (isDSOffsetLegal(LHS.getReg(), PossibleOffset, 16)) { 2313 // (add n0, c0) 2314 return std::make_pair(LHS.getReg(), PossibleOffset); 2315 } 2316 } 2317 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 2318 // TODO 2319 2320 2321 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 2322 // TODO 2323 2324 } 2325 2326 return std::make_pair(Root.getReg(), 0); 2327 } 2328 2329 InstructionSelector::ComplexRendererFns 2330 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { 2331 2332 Register Reg; 2333 unsigned Offset; 2334 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); 2335 return {{ 2336 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2337 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } 2338 }}; 2339 } 2340 2341 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return 2342 /// the base value with the constant offset. There may be intervening copies 2343 /// between \p Root and the identified constant. Returns \p Root, 0 if this does 2344 /// not match the pattern. 2345 std::pair<Register, int64_t> 2346 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( 2347 Register Root, const MachineRegisterInfo &MRI) const { 2348 MachineInstr *RootI = MRI.getVRegDef(Root); 2349 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) 2350 return {Root, 0}; 2351 2352 MachineOperand &RHS = RootI->getOperand(2); 2353 Optional<ValueAndVReg> MaybeOffset 2354 = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true); 2355 if (!MaybeOffset) 2356 return {Root, 0}; 2357 return {RootI->getOperand(1).getReg(), MaybeOffset->Value}; 2358 } 2359 2360 static void addZeroImm(MachineInstrBuilder &MIB) { 2361 MIB.addImm(0); 2362 } 2363 2364 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p 2365 /// BasePtr is not valid, a null base pointer will be used. 
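/// The descriptor is assembled with REG_SEQUENCE as { base pointer or 0 in
/// sub0_sub1, FormatLo in sub2, FormatHi in sub3 }.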
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the register that holds the constants before building
  // the full 128-bit register. If we are building multiple resource
  // descriptors, this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}

static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}

static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: Don't know that this was defined by operand 0
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return whether the addr64 MUBUF mode should be used for the given address.
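/// Addr64 is used when the address contains a two-register ptr_add (optionally
/// plus a constant), or when the base pointer lives in a VGPR.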
2459 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const { 2460 // (ptr_add N2, N3) -> addr64, or 2461 // (ptr_add (ptr_add N2, N3), C1) -> addr64 2462 if (Addr.N2) 2463 return true; 2464 2465 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI); 2466 return N0Bank->getID() == AMDGPU::VGPRRegBankID; 2467 } 2468 2469 /// Split an immediate offset \p ImmOffset depending on whether it fits in the 2470 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable 2471 /// component. 2472 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset( 2473 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const { 2474 if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset)) 2475 return; 2476 2477 // Illegal offset, store it in soffset. 2478 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2479 B.buildInstr(AMDGPU::S_MOV_B32) 2480 .addDef(SOffset) 2481 .addImm(ImmOffset); 2482 ImmOffset = 0; 2483 } 2484 2485 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl( 2486 MachineOperand &Root, Register &VAddr, Register &RSrcReg, 2487 Register &SOffset, int64_t &Offset) const { 2488 // FIXME: Predicates should stop this from reaching here. 2489 // addr64 bit was removed for volcanic islands. 2490 if (!STI.hasAddr64() || STI.useFlatForGlobal()) 2491 return false; 2492 2493 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 2494 if (!shouldUseAddr64(AddrData)) 2495 return false; 2496 2497 Register N0 = AddrData.N0; 2498 Register N2 = AddrData.N2; 2499 Register N3 = AddrData.N3; 2500 Offset = AddrData.Offset; 2501 2502 // Base pointer for the SRD. 2503 Register SRDPtr; 2504 2505 if (N2) { 2506 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 2507 assert(N3); 2508 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 2509 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the 2510 // addr64, and construct the default resource from a 0 address. 2511 VAddr = N0; 2512 } else { 2513 SRDPtr = N3; 2514 VAddr = N2; 2515 } 2516 } else { 2517 // N2 is not divergent. 2518 SRDPtr = N2; 2519 VAddr = N3; 2520 } 2521 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 2522 // Use the default null pointer in the resource 2523 VAddr = N0; 2524 } else { 2525 // N0 -> offset, or 2526 // (N0 + C1) -> offset 2527 SRDPtr = N0; 2528 } 2529 2530 MachineIRBuilder B(*Root.getParent()); 2531 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr); 2532 splitIllegalMUBUFOffset(B, SOffset, Offset); 2533 return true; 2534 } 2535 2536 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( 2537 MachineOperand &Root, Register &RSrcReg, Register &SOffset, 2538 int64_t &Offset) const { 2539 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 2540 if (shouldUseAddr64(AddrData)) 2541 return false; 2542 2543 // N0 -> offset, or 2544 // (N0 + C1) -> offset 2545 Register SRDPtr = AddrData.N0; 2546 Offset = AddrData.Offset; 2547 2548 // TODO: Look through extensions for 32-bit soffset. 
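  // Build the resource descriptor from the SGPR pointer base and move an
  // offset that does not fit the MUBUF immediate field into soffset.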
2549 MachineIRBuilder B(*Root.getParent()); 2550 2551 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); 2552 splitIllegalMUBUFOffset(B, SOffset, Offset); 2553 return true; 2554 } 2555 2556 InstructionSelector::ComplexRendererFns 2557 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { 2558 Register VAddr; 2559 Register RSrcReg; 2560 Register SOffset; 2561 int64_t Offset = 0; 2562 2563 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 2564 return {}; 2565 2566 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 2567 // pattern. 2568 return {{ 2569 [=](MachineInstrBuilder &MIB) { // rsrc 2570 MIB.addReg(RSrcReg); 2571 }, 2572 [=](MachineInstrBuilder &MIB) { // vaddr 2573 MIB.addReg(VAddr); 2574 }, 2575 [=](MachineInstrBuilder &MIB) { // soffset 2576 if (SOffset) 2577 MIB.addReg(SOffset); 2578 else 2579 MIB.addImm(0); 2580 }, 2581 [=](MachineInstrBuilder &MIB) { // offset 2582 MIB.addImm(Offset); 2583 }, 2584 addZeroImm, // glc 2585 addZeroImm, // slc 2586 addZeroImm, // tfe 2587 addZeroImm, // dlc 2588 addZeroImm // swz 2589 }}; 2590 } 2591 2592 InstructionSelector::ComplexRendererFns 2593 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { 2594 Register RSrcReg; 2595 Register SOffset; 2596 int64_t Offset = 0; 2597 2598 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 2599 return {}; 2600 2601 return {{ 2602 [=](MachineInstrBuilder &MIB) { // rsrc 2603 MIB.addReg(RSrcReg); 2604 }, 2605 [=](MachineInstrBuilder &MIB) { // soffset 2606 if (SOffset) 2607 MIB.addReg(SOffset); 2608 else 2609 MIB.addImm(0); 2610 }, 2611 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 2612 addZeroImm, // glc 2613 addZeroImm, // slc 2614 addZeroImm, // tfe 2615 addZeroImm, // dlc 2616 addZeroImm // swz 2617 }}; 2618 } 2619 2620 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, 2621 const MachineInstr &MI, 2622 int OpIdx) const { 2623 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 2624 "Expected G_CONSTANT"); 2625 Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), *MRI); 2626 assert(CstVal && "Expected constant value"); 2627 MIB.addImm(CstVal.getValue()); 2628 } 2629 2630 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, 2631 const MachineInstr &MI, 2632 int OpIdx) const { 2633 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 2634 "Expected G_CONSTANT"); 2635 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue()); 2636 } 2637 2638 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB, 2639 const MachineInstr &MI, 2640 int OpIdx) const { 2641 assert(OpIdx == -1); 2642 2643 const MachineOperand &Op = MI.getOperand(1); 2644 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) 2645 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 2646 else { 2647 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); 2648 MIB.addImm(Op.getCImm()->getSExtValue()); 2649 } 2650 } 2651 2652 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, 2653 const MachineInstr &MI, 2654 int OpIdx) const { 2655 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 2656 "Expected G_CONSTANT"); 2657 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); 2658 } 2659 2660 /// This only really exists to satisfy DAG type checking machinery, so is a 2661 /// no-op here. 
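/// The immediate operand is rendered unchanged.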
2662 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, 2663 const MachineInstr &MI, 2664 int OpIdx) const { 2665 MIB.addImm(MI.getOperand(OpIdx).getImm()); 2666 } 2667 2668 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB, 2669 const MachineInstr &MI, 2670 int OpIdx) const { 2671 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2672 MIB.addImm(MI.getOperand(OpIdx).getImm() & 1); 2673 } 2674 2675 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB, 2676 const MachineInstr &MI, 2677 int OpIdx) const { 2678 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2679 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1); 2680 } 2681 2682 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB, 2683 const MachineInstr &MI, 2684 int OpIdx) const { 2685 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2686 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1); 2687 } 2688 2689 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, 2690 const MachineInstr &MI, 2691 int OpIdx) const { 2692 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2693 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); 2694 } 2695 2696 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { 2697 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); 2698 } 2699 2700 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const { 2701 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm()); 2702 } 2703 2704 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const { 2705 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm()); 2706 } 2707 2708 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { 2709 return TII.isInlineConstant(Imm); 2710 } 2711