1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUInstructionSelector.h" 15 #include "AMDGPUInstrInfo.h" 16 #include "AMDGPUGlobalISelUtils.h" 17 #include "AMDGPURegisterBankInfo.h" 18 #include "AMDGPUSubtarget.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 21 #include "SIMachineFunctionInfo.h" 22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" 25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 26 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 27 #include "llvm/CodeGen/GlobalISel/Utils.h" 28 #include "llvm/CodeGen/MachineBasicBlock.h" 29 #include "llvm/CodeGen/MachineFunction.h" 30 #include "llvm/CodeGen/MachineInstr.h" 31 #include "llvm/CodeGen/MachineInstrBuilder.h" 32 #include "llvm/CodeGen/MachineRegisterInfo.h" 33 #include "llvm/IR/Type.h" 34 #include "llvm/Support/Debug.h" 35 #include "llvm/Support/raw_ostream.h" 36 37 #define DEBUG_TYPE "amdgpu-isel" 38 39 using namespace llvm; 40 using namespace MIPatternMatch; 41 42 #define GET_GLOBALISEL_IMPL 43 #define AMDGPUSubtarget GCNSubtarget 44 #include "AMDGPUGenGlobalISel.inc" 45 #undef GET_GLOBALISEL_IMPL 46 #undef AMDGPUSubtarget 47 48 AMDGPUInstructionSelector::AMDGPUInstructionSelector( 49 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, 50 const AMDGPUTargetMachine &TM) 51 : InstructionSelector(), TII(*STI.getInstrInfo()), 52 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), 53 STI(STI), 54 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), 55 #define GET_GLOBALISEL_PREDICATES_INIT 56 #include "AMDGPUGenGlobalISel.inc" 57 #undef GET_GLOBALISEL_PREDICATES_INIT 58 #define GET_GLOBALISEL_TEMPORARIES_INIT 59 #include "AMDGPUGenGlobalISel.inc" 60 #undef GET_GLOBALISEL_TEMPORARIES_INIT 61 { 62 } 63 64 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } 65 66 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, 67 CodeGenCoverage &CoverageInfo) { 68 MRI = &MF.getRegInfo(); 69 InstructionSelector::setupMF(MF, KB, CoverageInfo); 70 } 71 72 bool AMDGPUInstructionSelector::isVCC(Register Reg, 73 const MachineRegisterInfo &MRI) const { 74 if (Register::isPhysicalRegister(Reg)) 75 return Reg == TRI.getVCC(); 76 77 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 78 const TargetRegisterClass *RC = 79 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 80 if (RC) { 81 const LLT Ty = MRI.getType(Reg); 82 return RC->hasSuperClassEq(TRI.getBoolRC()) && 83 Ty.isValid() && Ty.getSizeInBits() == 1; 84 } 85 86 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 87 return RB->getID() == AMDGPU::VCCRegBankID; 88 } 89 90 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, 91 unsigned NewOpc) const { 92 MI.setDesc(TII.get(NewOpc)); 93 MI.RemoveOperand(1); // Remove intrinsic 
ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to know
    // specifically that the result is VCCRegBank, and not just an SGPR
    // with size 1. An SReg_32 with size 1 is ambiguous with wave32.
    if (Src.isUndef()) {
      const TargetRegisterClass *SrcRC =
        TRI.getConstrainedRegClassForOperand(Src, *MRI);
      if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (Register::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // TODO: Verify this doesn't have insane operands (i.e.
VGPR to SGPR copy) 201 202 const RegClassOrRegBank &RegClassOrBank = 203 MRI->getRegClassOrRegBank(DefReg); 204 205 const TargetRegisterClass *DefRC 206 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 207 if (!DefRC) { 208 if (!DefTy.isValid()) { 209 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 210 return false; 211 } 212 213 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 214 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); 215 if (!DefRC) { 216 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 217 return false; 218 } 219 } 220 221 // TODO: Verify that all registers have the same bank 222 I.setDesc(TII.get(TargetOpcode::PHI)); 223 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); 224 } 225 226 MachineOperand 227 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, 228 const TargetRegisterClass &SubRC, 229 unsigned SubIdx) const { 230 231 MachineInstr *MI = MO.getParent(); 232 MachineBasicBlock *BB = MO.getParent()->getParent(); 233 Register DstReg = MRI->createVirtualRegister(&SubRC); 234 235 if (MO.isReg()) { 236 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); 237 Register Reg = MO.getReg(); 238 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 239 .addReg(Reg, 0, ComposedSubIdx); 240 241 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), 242 MO.isKill(), MO.isDead(), MO.isUndef(), 243 MO.isEarlyClobber(), 0, MO.isDebug(), 244 MO.isInternalRead()); 245 } 246 247 assert(MO.isImm()); 248 249 APInt Imm(64, MO.getImm()); 250 251 switch (SubIdx) { 252 default: 253 llvm_unreachable("do not know to split immediate with this sub index."); 254 case AMDGPU::sub0: 255 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); 256 case AMDGPU::sub1: 257 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); 258 } 259 } 260 261 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { 262 switch (Opc) { 263 case AMDGPU::G_AND: 264 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 265 case AMDGPU::G_OR: 266 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; 267 case AMDGPU::G_XOR: 268 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 269 default: 270 llvm_unreachable("not a bit op"); 271 } 272 } 273 274 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 275 MachineOperand &Dst = I.getOperand(0); 276 MachineOperand &Src0 = I.getOperand(1); 277 MachineOperand &Src1 = I.getOperand(2); 278 Register DstReg = Dst.getReg(); 279 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 280 281 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 282 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 283 const TargetRegisterClass *RC = TRI.getBoolRC(); 284 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), 285 RC == &AMDGPU::SReg_64RegClass); 286 I.setDesc(TII.get(InstOpc)); 287 // Dead implicit-def of scc 288 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 289 true, // isImp 290 false, // isKill 291 true)); // isDead 292 293 // FIXME: Hack to avoid turning the register bank into a register class. 294 // The selector for G_ICMP relies on seeing the register bank for the result 295 // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will 296 // be ambiguous whether it's a scalar or vector bool. 
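    // Given that, only undef sources with no class assigned yet are given the
    // wave-wide bool class below; any other source keeps just its VCC bank so
    // the instruction defining it can still see that bank when it is selected.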
297 if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) 298 MRI->setRegClass(Src0.getReg(), RC); 299 if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) 300 MRI->setRegClass(Src1.getReg(), RC); 301 302 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 303 } 304 305 // TODO: Should this allow an SCC bank result, and produce a copy from SCC for 306 // the result? 307 if (DstRB->getID() == AMDGPU::SGPRRegBankID) { 308 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); 309 I.setDesc(TII.get(InstOpc)); 310 // Dead implicit-def of scc 311 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 312 true, // isImp 313 false, // isKill 314 true)); // isDead 315 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 316 } 317 318 return false; 319 } 320 321 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 322 MachineBasicBlock *BB = I.getParent(); 323 MachineFunction *MF = BB->getParent(); 324 Register DstReg = I.getOperand(0).getReg(); 325 const DebugLoc &DL = I.getDebugLoc(); 326 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 327 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 328 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 329 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 330 331 if (Size == 32) { 332 if (IsSALU) { 333 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 334 MachineInstr *Add = 335 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 336 .add(I.getOperand(1)) 337 .add(I.getOperand(2)); 338 I.eraseFromParent(); 339 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 340 } 341 342 if (STI.hasAddNoCarry()) { 343 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 344 I.setDesc(TII.get(Opc)); 345 I.addOperand(*MF, MachineOperand::CreateImm(0)); 346 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 347 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 348 } 349 350 const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; 351 352 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); 353 MachineInstr *Add 354 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 355 .addDef(UnusedCarry, RegState::Dead) 356 .add(I.getOperand(1)) 357 .add(I.getOperand(2)) 358 .addImm(0); 359 I.eraseFromParent(); 360 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 361 } 362 363 assert(!Sub && "illegal sub should not reach here"); 364 365 const TargetRegisterClass &RC 366 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; 367 const TargetRegisterClass &HalfRC 368 = IsSALU ? 
AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass; 369 370 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0)); 371 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0)); 372 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); 373 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); 374 375 Register DstLo = MRI->createVirtualRegister(&HalfRC); 376 Register DstHi = MRI->createVirtualRegister(&HalfRC); 377 378 if (IsSALU) { 379 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) 380 .add(Lo1) 381 .add(Lo2); 382 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) 383 .add(Hi1) 384 .add(Hi2); 385 } else { 386 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); 387 Register CarryReg = MRI->createVirtualRegister(CarryRC); 388 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo) 389 .addDef(CarryReg) 390 .add(Lo1) 391 .add(Lo2) 392 .addImm(0); 393 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) 394 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) 395 .add(Hi1) 396 .add(Hi2) 397 .addReg(CarryReg, RegState::Kill) 398 .addImm(0); 399 400 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) 401 return false; 402 } 403 404 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 405 .addReg(DstLo) 406 .addImm(AMDGPU::sub0) 407 .addReg(DstHi) 408 .addImm(AMDGPU::sub1); 409 410 411 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 412 return false; 413 414 I.eraseFromParent(); 415 return true; 416 } 417 418 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( 419 MachineInstr &I) const { 420 MachineBasicBlock *BB = I.getParent(); 421 MachineFunction *MF = BB->getParent(); 422 const DebugLoc &DL = I.getDebugLoc(); 423 Register Dst0Reg = I.getOperand(0).getReg(); 424 Register Dst1Reg = I.getOperand(1).getReg(); 425 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO || 426 I.getOpcode() == AMDGPU::G_UADDE; 427 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE || 428 I.getOpcode() == AMDGPU::G_USUBE; 429 430 if (isVCC(Dst1Reg, *MRI)) { 431 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned 432 // carry out despite the _i32 name. These were renamed in VI to _U32. 433 // FIXME: We should probably rename the opcodes here. 434 unsigned NoCarryOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; 435 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 436 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc)); 437 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 438 I.addOperand(*MF, MachineOperand::CreateImm(0)); 439 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 440 } 441 442 Register Src0Reg = I.getOperand(2).getReg(); 443 Register Src1Reg = I.getOperand(3).getReg(); 444 445 if (HasCarryIn) { 446 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 447 .addReg(I.getOperand(4).getReg()); 448 } 449 450 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 451 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 452 453 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? 
CarryOpc : NoCarryOpc), Dst0Reg) 454 .add(I.getOperand(2)) 455 .add(I.getOperand(3)); 456 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 457 .addReg(AMDGPU::SCC); 458 459 if (!MRI->getRegClassOrNull(Dst1Reg)) 460 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 461 462 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 463 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 464 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 465 return false; 466 467 if (HasCarryIn && 468 !RBI.constrainGenericRegister(I.getOperand(4).getReg(), 469 AMDGPU::SReg_32RegClass, *MRI)) 470 return false; 471 472 I.eraseFromParent(); 473 return true; 474 } 475 476 // TODO: We should probably legalize these to only using 32-bit results. 477 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 478 MachineBasicBlock *BB = I.getParent(); 479 Register DstReg = I.getOperand(0).getReg(); 480 Register SrcReg = I.getOperand(1).getReg(); 481 LLT DstTy = MRI->getType(DstReg); 482 LLT SrcTy = MRI->getType(SrcReg); 483 const unsigned SrcSize = SrcTy.getSizeInBits(); 484 const unsigned DstSize = DstTy.getSizeInBits(); 485 486 // TODO: Should handle any multiple of 32 offset. 487 unsigned Offset = I.getOperand(2).getImm(); 488 if (Offset % 32 != 0 || DstSize > 128) 489 return false; 490 491 const TargetRegisterClass *DstRC = 492 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI); 493 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 494 return false; 495 496 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 497 const TargetRegisterClass *SrcRC = 498 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 499 if (!SrcRC) 500 return false; 501 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32, 502 DstSize / 32); 503 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg); 504 if (!SrcRC) 505 return false; 506 507 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I, 508 *SrcRC, I.getOperand(1)); 509 const DebugLoc &DL = I.getDebugLoc(); 510 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) 511 .addReg(SrcReg, 0, SubReg); 512 513 I.eraseFromParent(); 514 return true; 515 } 516 517 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 518 MachineBasicBlock *BB = MI.getParent(); 519 Register DstReg = MI.getOperand(0).getReg(); 520 LLT DstTy = MRI->getType(DstReg); 521 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 522 523 const unsigned SrcSize = SrcTy.getSizeInBits(); 524 if (SrcSize < 32) 525 return selectImpl(MI, *CoverageInfo); 526 527 const DebugLoc &DL = MI.getDebugLoc(); 528 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 529 const unsigned DstSize = DstTy.getSizeInBits(); 530 const TargetRegisterClass *DstRC = 531 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 532 if (!DstRC) 533 return false; 534 535 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 536 MachineInstrBuilder MIB = 537 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 538 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 539 MachineOperand &Src = MI.getOperand(I + 1); 540 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 541 MIB.addImm(SubRegs[I]); 542 543 const TargetRegisterClass *SrcRC 544 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 545 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 546 return false; 547 } 548 549 if 
(!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 550 return false; 551 552 MI.eraseFromParent(); 553 return true; 554 } 555 556 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 557 MachineBasicBlock *BB = MI.getParent(); 558 const int NumDst = MI.getNumOperands() - 1; 559 560 MachineOperand &Src = MI.getOperand(NumDst); 561 562 Register SrcReg = Src.getReg(); 563 Register DstReg0 = MI.getOperand(0).getReg(); 564 LLT DstTy = MRI->getType(DstReg0); 565 LLT SrcTy = MRI->getType(SrcReg); 566 567 const unsigned DstSize = DstTy.getSizeInBits(); 568 const unsigned SrcSize = SrcTy.getSizeInBits(); 569 const DebugLoc &DL = MI.getDebugLoc(); 570 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 571 572 const TargetRegisterClass *SrcRC = 573 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 574 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 575 return false; 576 577 const unsigned SrcFlags = getUndefRegState(Src.isUndef()); 578 579 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 580 // source, and this relies on the fact that the same subregister indices are 581 // used for both. 582 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 583 for (int I = 0, E = NumDst; I != E; ++I) { 584 MachineOperand &Dst = MI.getOperand(I); 585 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 586 .addReg(SrcReg, SrcFlags, SubRegs[I]); 587 588 const TargetRegisterClass *DstRC = 589 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 590 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 591 return false; 592 } 593 594 MI.eraseFromParent(); 595 return true; 596 } 597 598 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 599 return selectG_ADD_SUB(I); 600 } 601 602 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 603 const MachineOperand &MO = I.getOperand(0); 604 605 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 606 // regbank check here is to know why getConstrainedRegClassForOperand failed. 
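  // Sketch of the check below: with neither a class nor a bank there is
  // nothing to constrain and the plain IMPLICIT_DEF is fine; with an inferred
  // class, selection only succeeds if the register can be constrained to it.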
607 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 608 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 609 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 610 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 611 return true; 612 } 613 614 return false; 615 } 616 617 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 618 MachineBasicBlock *BB = I.getParent(); 619 620 Register DstReg = I.getOperand(0).getReg(); 621 Register Src0Reg = I.getOperand(1).getReg(); 622 Register Src1Reg = I.getOperand(2).getReg(); 623 LLT Src1Ty = MRI->getType(Src1Reg); 624 625 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 626 unsigned InsSize = Src1Ty.getSizeInBits(); 627 628 int64_t Offset = I.getOperand(3).getImm(); 629 if (Offset % 32 != 0) 630 return false; 631 632 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 633 if (SubReg == AMDGPU::NoSubRegister) 634 return false; 635 636 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 637 const TargetRegisterClass *DstRC = 638 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 639 if (!DstRC) 640 return false; 641 642 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 643 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 644 const TargetRegisterClass *Src0RC = 645 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 646 const TargetRegisterClass *Src1RC = 647 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 648 649 // Deal with weird cases where the class only partially supports the subreg 650 // index. 651 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); 652 if (!Src0RC) 653 return false; 654 655 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 656 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) || 657 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI)) 658 return false; 659 660 const DebugLoc &DL = I.getDebugLoc(); 661 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg) 662 .addReg(Src0Reg) 663 .addReg(Src1Reg) 664 .addImm(SubReg); 665 666 I.eraseFromParent(); 667 return true; 668 } 669 670 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const { 671 if (STI.getLDSBankCount() != 16) 672 return selectImpl(MI, *CoverageInfo); 673 674 Register Dst = MI.getOperand(0).getReg(); 675 Register Src0 = MI.getOperand(2).getReg(); 676 Register M0Val = MI.getOperand(6).getReg(); 677 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) || 678 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) || 679 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI)) 680 return false; 681 682 // This requires 2 instructions. It is possible to write a pattern to support 683 // this, but the generated isel emitter doesn't correctly deal with multiple 684 // output instructions using the same physical register input. The copy to m0 685 // is incorrectly placed before the second instruction. 686 // 687 // TODO: Match source modifiers. 
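  // Rough shape of the two-instruction expansion emitted below (operand names
  // abbreviated, not exact MIR syntax):
  //   %mov:vgpr_32 = V_INTERP_MOV_F32 2, $attr, $attrchan          (reads m0)
  //   $dst = V_INTERP_P1LV_F16 0, %src0, $attr, $attrchan, 0, %mov, $high, 0, 0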
  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::WWM);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
      .add(I.getOperand(2))
      .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
      constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
      RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
                               I.getOperand(0).getReg())
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
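  // SI_END_CF only needs the saved exec-mask operand in the wave-wide bool
  // class; since that class may not be assigned yet, it is set manually below
  // rather than going through constrainSelectedInstRegOperands.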
870 MachineBasicBlock *BB = MI.getParent(); 871 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) 872 .add(MI.getOperand(1)); 873 874 Register Reg = MI.getOperand(1).getReg(); 875 MI.eraseFromParent(); 876 877 if (!MRI->getRegClassOrNull(Reg)) 878 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 879 return true; 880 } 881 882 static unsigned getDSShaderTypeValue(const MachineFunction &MF) { 883 switch (MF.getFunction().getCallingConv()) { 884 case CallingConv::AMDGPU_PS: 885 return 1; 886 case CallingConv::AMDGPU_VS: 887 return 2; 888 case CallingConv::AMDGPU_GS: 889 return 3; 890 case CallingConv::AMDGPU_HS: 891 case CallingConv::AMDGPU_LS: 892 case CallingConv::AMDGPU_ES: 893 report_fatal_error("ds_ordered_count unsupported for this calling conv"); 894 case CallingConv::AMDGPU_CS: 895 case CallingConv::AMDGPU_KERNEL: 896 case CallingConv::C: 897 case CallingConv::Fast: 898 default: 899 // Assume other calling conventions are various compute callable functions 900 return 0; 901 } 902 } 903 904 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 905 MachineInstr &MI, Intrinsic::ID IntrID) const { 906 MachineBasicBlock *MBB = MI.getParent(); 907 MachineFunction *MF = MBB->getParent(); 908 const DebugLoc &DL = MI.getDebugLoc(); 909 910 unsigned IndexOperand = MI.getOperand(7).getImm(); 911 bool WaveRelease = MI.getOperand(8).getImm() != 0; 912 bool WaveDone = MI.getOperand(9).getImm() != 0; 913 914 if (WaveDone && !WaveRelease) 915 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 916 917 unsigned OrderedCountIndex = IndexOperand & 0x3f; 918 IndexOperand &= ~0x3f; 919 unsigned CountDw = 0; 920 921 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 922 CountDw = (IndexOperand >> 24) & 0xf; 923 IndexOperand &= ~(0xf << 24); 924 925 if (CountDw < 1 || CountDw > 4) { 926 report_fatal_error( 927 "ds_ordered_count: dword count must be between 1 and 4"); 928 } 929 } 930 931 if (IndexOperand) 932 report_fatal_error("ds_ordered_count: bad index operand"); 933 934 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; 935 unsigned ShaderType = getDSShaderTypeValue(*MF); 936 937 unsigned Offset0 = OrderedCountIndex << 2; 938 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | 939 (Instruction << 4); 940 941 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) 942 Offset1 |= (CountDw - 1) << 6; 943 944 unsigned Offset = Offset0 | (Offset1 << 8); 945 946 Register M0Val = MI.getOperand(2).getReg(); 947 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 948 .addReg(M0Val); 949 950 Register DstReg = MI.getOperand(0).getReg(); 951 Register ValReg = MI.getOperand(3).getReg(); 952 MachineInstrBuilder DS = 953 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) 954 .addReg(ValReg) 955 .addImm(Offset) 956 .cloneMemRefs(MI); 957 958 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) 959 return false; 960 961 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); 962 MI.eraseFromParent(); 963 return Ret; 964 } 965 966 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 967 switch (IntrID) { 968 case Intrinsic::amdgcn_ds_gws_init: 969 return AMDGPU::DS_GWS_INIT; 970 case Intrinsic::amdgcn_ds_gws_barrier: 971 return AMDGPU::DS_GWS_BARRIER; 972 case Intrinsic::amdgcn_ds_gws_sema_v: 973 return AMDGPU::DS_GWS_SEMA_V; 974 case Intrinsic::amdgcn_ds_gws_sema_br: 975 return AMDGPU::DS_GWS_SEMA_BR; 976 case Intrinsic::amdgcn_ds_gws_sema_p: 977 return AMDGPU::DS_GWS_SEMA_P; 978 case Intrinsic::amdgcn_ds_gws_sema_release_all: 979 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 980 default: 981 llvm_unreachable("not a gws intrinsic"); 982 } 983 } 984 985 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, 986 Intrinsic::ID IID) const { 987 if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && 988 !STI.hasGWSSemaReleaseAll()) 989 return false; 990 991 // intrinsic ID, vsrc, offset 992 const bool HasVSrc = MI.getNumOperands() == 3; 993 assert(HasVSrc || MI.getNumOperands() == 2); 994 995 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg(); 996 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); 997 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) 998 return false; 999 1000 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1001 assert(OffsetDef); 1002 1003 unsigned ImmOffset; 1004 1005 MachineBasicBlock *MBB = MI.getParent(); 1006 const DebugLoc &DL = MI.getDebugLoc(); 1007 1008 MachineInstr *Readfirstlane = nullptr; 1009 1010 // If we legalized the VGPR input, strip out the readfirstlane to analyze the 1011 // incoming offset, in case there's an add of a constant. We'll have to put it 1012 // back later. 1013 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { 1014 Readfirstlane = OffsetDef; 1015 BaseOffset = OffsetDef->getOperand(1).getReg(); 1016 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1017 } 1018 1019 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { 1020 // If we have a constant offset, try to use the 0 in m0 as the base. 1021 // TODO: Look into changing the default m0 initialization value. If the 1022 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1023 // the immediate offset. 
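    // With a constant offset the whole value can be carried in the
    // instruction's immediate offset field and m0 is simply zeroed; the else
    // path below instead shifts the variable base up by 16 bits into m0 (the
    // hardware reads bits [21:16]) and keeps only the constant remainder as
    // the immediate.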
1024 1025 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); 1026 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1027 .addImm(0); 1028 } else { 1029 std::tie(BaseOffset, ImmOffset, OffsetDef) 1030 = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset); 1031 1032 if (Readfirstlane) { 1033 // We have the constant offset now, so put the readfirstlane back on the 1034 // variable component. 1035 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) 1036 return false; 1037 1038 Readfirstlane->getOperand(1).setReg(BaseOffset); 1039 BaseOffset = Readfirstlane->getOperand(0).getReg(); 1040 } else { 1041 if (!RBI.constrainGenericRegister(BaseOffset, 1042 AMDGPU::SReg_32RegClass, *MRI)) 1043 return false; 1044 } 1045 1046 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1047 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) 1048 .addReg(BaseOffset) 1049 .addImm(16); 1050 1051 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1052 .addReg(M0Base); 1053 } 1054 1055 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1056 // offset field) % 64. Some versions of the programming guide omit the m0 1057 // part, or claim it's from offset 0. 1058 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); 1059 1060 if (HasVSrc) { 1061 Register VSrc = MI.getOperand(1).getReg(); 1062 MIB.addReg(VSrc); 1063 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) 1064 return false; 1065 } 1066 1067 MIB.addImm(ImmOffset) 1068 .addImm(-1) // $gds 1069 .cloneMemRefs(MI); 1070 1071 MI.eraseFromParent(); 1072 return true; 1073 } 1074 1075 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, 1076 bool IsAppend) const { 1077 Register PtrBase = MI.getOperand(2).getReg(); 1078 LLT PtrTy = MRI->getType(PtrBase); 1079 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 1080 1081 unsigned Offset; 1082 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); 1083 1084 // TODO: Should this try to look through readfirstlane like GWS? 1085 if (!isDSOffsetLegal(PtrBase, Offset, 16)) { 1086 PtrBase = MI.getOperand(2).getReg(); 1087 Offset = 0; 1088 } 1089 1090 MachineBasicBlock *MBB = MI.getParent(); 1091 const DebugLoc &DL = MI.getDebugLoc(); 1092 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1093 1094 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1095 .addReg(PtrBase); 1096 BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) 1097 .addImm(Offset) 1098 .addImm(IsGDS ? 
               -1 : 0)
    .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
  MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class that we use
    // to represent it, so the register class has to be set manually here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
               constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
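  // V_CNDMASK_B32_e64 returns src1 for lanes whose condition bit is set, so
  // the G_SELECT false value (operand 3) goes in src0 and the true value
  // (operand 2) in src1 in the expansion below.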
1164 if (Size > 32) 1165 return false; 1166 1167 MachineInstr *Select = 1168 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1169 .addImm(0) 1170 .add(I.getOperand(3)) 1171 .addImm(0) 1172 .add(I.getOperand(2)) 1173 .add(I.getOperand(1)); 1174 1175 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1176 I.eraseFromParent(); 1177 return Ret; 1178 } 1179 1180 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1181 initM0(I); 1182 return selectImpl(I, *CoverageInfo); 1183 } 1184 1185 static int sizeToSubRegIndex(unsigned Size) { 1186 switch (Size) { 1187 case 32: 1188 return AMDGPU::sub0; 1189 case 64: 1190 return AMDGPU::sub0_sub1; 1191 case 96: 1192 return AMDGPU::sub0_sub1_sub2; 1193 case 128: 1194 return AMDGPU::sub0_sub1_sub2_sub3; 1195 case 256: 1196 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1197 default: 1198 if (Size < 32) 1199 return AMDGPU::sub0; 1200 if (Size > 256) 1201 return -1; 1202 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1203 } 1204 } 1205 1206 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1207 Register DstReg = I.getOperand(0).getReg(); 1208 Register SrcReg = I.getOperand(1).getReg(); 1209 const LLT DstTy = MRI->getType(DstReg); 1210 const LLT SrcTy = MRI->getType(SrcReg); 1211 const LLT S1 = LLT::scalar(1); 1212 1213 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1214 const RegisterBank *DstRB; 1215 if (DstTy == S1) { 1216 // This is a special case. We don't treat s1 for legalization artifacts as 1217 // vcc booleans. 1218 DstRB = SrcRB; 1219 } else { 1220 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1221 if (SrcRB != DstRB) 1222 return false; 1223 } 1224 1225 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 1226 1227 unsigned DstSize = DstTy.getSizeInBits(); 1228 unsigned SrcSize = SrcTy.getSizeInBits(); 1229 1230 const TargetRegisterClass *SrcRC 1231 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1232 const TargetRegisterClass *DstRC 1233 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1234 1235 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1236 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1237 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1238 return false; 1239 } 1240 1241 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { 1242 MachineBasicBlock *MBB = I.getParent(); 1243 const DebugLoc &DL = I.getDebugLoc(); 1244 1245 Register LoReg = MRI->createVirtualRegister(DstRC); 1246 Register HiReg = MRI->createVirtualRegister(DstRC); 1247 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 1248 .addReg(SrcReg, 0, AMDGPU::sub0); 1249 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 1250 .addReg(SrcReg, 0, AMDGPU::sub1); 1251 1252 if (IsVALU && STI.hasSDWA()) { 1253 // Write the low 16-bits of the high element into the high 16-bits of the 1254 // low element. 
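      // The SDWA mov below writes only WORD_1 of the destination and
      // preserves the other half, so the low 16 bits come from LoReg, which
      // is added as an implicit use and tied to the def.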
1255 MachineInstr *MovSDWA = 1256 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1257 .addImm(0) // $src0_modifiers 1258 .addReg(HiReg) // $src0 1259 .addImm(0) // $clamp 1260 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1261 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1262 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1263 .addReg(LoReg, RegState::Implicit); 1264 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1265 } else { 1266 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1267 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1268 Register ImmReg = MRI->createVirtualRegister(DstRC); 1269 if (IsVALU) { 1270 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1271 .addImm(16) 1272 .addReg(HiReg); 1273 } else { 1274 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1275 .addReg(HiReg) 1276 .addImm(16); 1277 } 1278 1279 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1280 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1281 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1282 1283 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1284 .addImm(0xffff); 1285 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1286 .addReg(LoReg) 1287 .addReg(ImmReg); 1288 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1289 .addReg(TmpReg0) 1290 .addReg(TmpReg1); 1291 } 1292 1293 I.eraseFromParent(); 1294 return true; 1295 } 1296 1297 if (!DstTy.isScalar()) 1298 return false; 1299 1300 if (SrcSize > 32) { 1301 int SubRegIdx = sizeToSubRegIndex(DstSize); 1302 if (SubRegIdx == -1) 1303 return false; 1304 1305 // Deal with weird cases where the class only partially supports the subreg 1306 // index. 1307 const TargetRegisterClass *SrcWithSubRC 1308 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1309 if (!SrcWithSubRC) 1310 return false; 1311 1312 if (SrcWithSubRC != SrcRC) { 1313 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1314 return false; 1315 } 1316 1317 I.getOperand(1).setSubReg(SubRegIdx); 1318 } 1319 1320 I.setDesc(TII.get(TargetOpcode::COPY)); 1321 return true; 1322 } 1323 1324 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1325 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1326 Mask = maskTrailingOnes<unsigned>(Size); 1327 int SignedMask = static_cast<int>(Mask); 1328 return SignedMask >= -16 && SignedMask <= 64; 1329 } 1330 1331 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1332 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1333 Register Reg, const MachineRegisterInfo &MRI, 1334 const TargetRegisterInfo &TRI) const { 1335 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1336 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1337 return RB; 1338 1339 // Ignore the type, since we don't use vcc in artifacts. 
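  // If a class has already been assigned, map it back to its bank; a register
  // with neither a class nor a bank falls through to the null return.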
1340 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1341 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1342 return nullptr; 1343 } 1344 1345 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1346 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1347 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1348 const DebugLoc &DL = I.getDebugLoc(); 1349 MachineBasicBlock &MBB = *I.getParent(); 1350 const Register DstReg = I.getOperand(0).getReg(); 1351 const Register SrcReg = I.getOperand(1).getReg(); 1352 1353 const LLT DstTy = MRI->getType(DstReg); 1354 const LLT SrcTy = MRI->getType(SrcReg); 1355 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1356 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1357 const unsigned DstSize = DstTy.getSizeInBits(); 1358 if (!DstTy.isScalar()) 1359 return false; 1360 1361 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1362 return selectCOPY(I); 1363 1364 // Artifact casts should never use vcc. 1365 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1366 1367 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1368 // 64-bit should have been split up in RegBankSelect 1369 1370 // Try to use an and with a mask if it will save code size. 1371 unsigned Mask; 1372 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1373 MachineInstr *ExtI = 1374 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1375 .addImm(Mask) 1376 .addReg(SrcReg); 1377 I.eraseFromParent(); 1378 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1379 } 1380 1381 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1382 MachineInstr *ExtI = 1383 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1384 .addReg(SrcReg) 1385 .addImm(0) // Offset 1386 .addImm(SrcSize); // Width 1387 I.eraseFromParent(); 1388 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1389 } 1390 1391 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1392 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 1393 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 1394 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 1395 return false; 1396 1397 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1398 const unsigned SextOpc = SrcSize == 8 ? 1399 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1400 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1401 .addReg(SrcReg); 1402 I.eraseFromParent(); 1403 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1404 } 1405 1406 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1407 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1408 1409 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1410 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 1411 // We need a 64-bit register source, but the high bits don't matter. 1412 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1413 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1414 unsigned SubReg = InReg ? 
AMDGPU::sub0 : 0; 1415 1416 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1417 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1418 .addReg(SrcReg, 0, SubReg) 1419 .addImm(AMDGPU::sub0) 1420 .addReg(UndefReg) 1421 .addImm(AMDGPU::sub1); 1422 1423 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1424 .addReg(ExtReg) 1425 .addImm(SrcSize << 16); 1426 1427 I.eraseFromParent(); 1428 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1429 } 1430 1431 unsigned Mask; 1432 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1433 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1434 .addReg(SrcReg) 1435 .addImm(Mask); 1436 } else { 1437 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1438 .addReg(SrcReg) 1439 .addImm(SrcSize << 16); 1440 } 1441 1442 I.eraseFromParent(); 1443 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1444 } 1445 1446 return false; 1447 } 1448 1449 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1450 MachineBasicBlock *BB = I.getParent(); 1451 MachineOperand &ImmOp = I.getOperand(1); 1452 1453 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1454 if (ImmOp.isFPImm()) { 1455 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1456 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1457 } else if (ImmOp.isCImm()) { 1458 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1459 } 1460 1461 Register DstReg = I.getOperand(0).getReg(); 1462 unsigned Size; 1463 bool IsSgpr; 1464 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1465 if (RB) { 1466 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1467 Size = MRI->getType(DstReg).getSizeInBits(); 1468 } else { 1469 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1470 IsSgpr = TRI.isSGPRClass(RC); 1471 Size = TRI.getRegSizeInBits(*RC); 1472 } 1473 1474 if (Size != 32 && Size != 64) 1475 return false; 1476 1477 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1478 if (Size == 32) { 1479 I.setDesc(TII.get(Opcode)); 1480 I.addImplicitDefUseOperands(*MF); 1481 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1482 } 1483 1484 const DebugLoc &DL = I.getDebugLoc(); 1485 1486 APInt Imm(Size, I.getOperand(1).getImm()); 1487 1488 MachineInstr *ResInst; 1489 if (IsSgpr && TII.isInlineConstant(Imm)) { 1490 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1491 .addImm(I.getOperand(1).getImm()); 1492 } else { 1493 const TargetRegisterClass *RC = IsSgpr ? 
1494 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 1495 Register LoReg = MRI->createVirtualRegister(RC); 1496 Register HiReg = MRI->createVirtualRegister(RC); 1497 1498 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 1499 .addImm(Imm.trunc(32).getZExtValue()); 1500 1501 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 1502 .addImm(Imm.ashr(32).getZExtValue()); 1503 1504 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1505 .addReg(LoReg) 1506 .addImm(AMDGPU::sub0) 1507 .addReg(HiReg) 1508 .addImm(AMDGPU::sub1); 1509 } 1510 1511 // We can't call constrainSelectedInstRegOperands here, because it doesn't 1512 // work for target independent opcodes 1513 I.eraseFromParent(); 1514 const TargetRegisterClass *DstRC = 1515 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 1516 if (!DstRC) 1517 return true; 1518 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1519 } 1520 1521 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 1522 // Only manually handle the f64 SGPR case. 1523 // 1524 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 1525 // the bit ops theoretically have a second result due to the implicit def of 1526 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 1527 // that is easy by disabling the check. The result works, but uses a 1528 // nonsensical sreg32orlds_and_sreg_1 regclass. 1529 // 1530 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 1531 // the variadic REG_SEQUENCE operands. 1532 1533 Register Dst = MI.getOperand(0).getReg(); 1534 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 1535 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 1536 MRI->getType(Dst) != LLT::scalar(64)) 1537 return false; 1538 1539 Register Src = MI.getOperand(1).getReg(); 1540 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 1541 if (Fabs) 1542 Src = Fabs->getOperand(1).getReg(); 1543 1544 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 1545 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 1546 return false; 1547 1548 MachineBasicBlock *BB = MI.getParent(); 1549 const DebugLoc &DL = MI.getDebugLoc(); 1550 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1551 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1552 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1553 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1554 1555 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 1556 .addReg(Src, 0, AMDGPU::sub0); 1557 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 1558 .addReg(Src, 0, AMDGPU::sub1); 1559 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 1560 .addImm(0x80000000); 1561 1562 // Set or toggle sign bit. 1563 unsigned Opc = Fabs ? 
AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}

bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
  }
}

bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}

// TODO: No rtn optimization.
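// The buffer cmpswap instructions used below take the new value and compare
// value packed into a single double-width register and return the old value
// in its low half, hence the double-width temporary and the sub0 / sub0_sub1
// copy out of it.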
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
  // RegBankSelect knows what it's doing if the branch condition is scc, even
  // though it currently does not.
  if (!isVCC(CondReg, *MRI)) {
    if (MRI->getType(CondReg) != LLT::scalar(32))
      return false;

    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    // FIXME: Hack for isSCC tests
    ConstrainRC = &AMDGPU::SGPR_32RegClass;
  } else {
    // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // We sort of know, based on the register bank, that a VCC producer ands
    // inactive lanes with 0. What if there was a logical operation with vcc
    // producers in different blocks/with different exec masks?
    // FIXME: Should scc->vcc copies be ANDed with exec?
    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  }

  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}
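// Note: selectG_BRCOND above emits one of two sequences depending on the
// condition's register bank (a sketch; register names are illustrative and
// the VCC register is vcc_lo in wave32 mode):
//
//   ; uniform (SCC) condition           ; divergent (VCC) condition
//   $scc = COPY %cond                   $vcc = COPY %cond
//   S_CBRANCH_SCC1 %bb.target           S_CBRANCH_VCCNZ %bb.target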
bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
    MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
  if (IsVGPR)
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  return RBI.constrainGenericRegister(
    DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
}

bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const {
  uint64_t Align = I.getOperand(2).getImm();
  const uint64_t Mask = ~((UINT64_C(1) << Align) - 1);

  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
  unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
  const TargetRegisterClass &RegRC
    = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  LLT Ty = MRI->getType(DstReg);

  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
                                                                  *MRI);
  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
                                                                  *MRI);
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  Register ImmReg = MRI->createVirtualRegister(&RegRC);
  BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg)
    .addImm(Mask);

  if (Ty.getSizeInBits() == 32) {
    BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
      .addReg(SrcReg)
      .addReg(ImmReg);
    I.eraseFromParent();
    return true;
  }

  Register HiReg = MRI->createVirtualRegister(&RegRC);
  Register LoReg = MRI->createVirtualRegister(&RegRC);
  Register MaskLo = MRI->createVirtualRegister(&RegRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(SrcReg, 0, AMDGPU::sub0);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(SrcReg, 0, AMDGPU::sub1);

  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo)
    .addReg(LoReg)
    .addReg(ImmReg);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(MaskLo)
    .addImm(AMDGPU::sub0)
    .addReg(HiReg)
    .addImm(AMDGPU::sub1);
  I.eraseFromParent();
  return true;
}
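// Note: in selectG_PTR_MASK above, operand 2 is the log2 of the alignment, so
// the mask clears the low bits; e.g. Align == 4 gives
// Mask == ~((1 << 4) - 1) == 0xfffffffffffffff0. For 64-bit pointers only the
// low 32 bits are ANDed and the high half is passed through, which works as
// long as the alignment is below 2^32 so the mask's high bits are all ones.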
/// Return the register to use for the index value, and the subregister to use
/// for the indirectly accessed register.
static std::pair<Register, unsigned>
computeIndirectRegIndex(MachineRegisterInfo &MRI,
                        const SIRegisterInfo &TRI,
                        const TargetRegisterClass *SuperRC,
                        Register IdxReg,
                        unsigned EltSize) {
  Register IdxBaseReg;
  int Offset;
  MachineInstr *Unused;

  std::tie(IdxBaseReg, Offset, Unused)
    = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
  if (IdxBaseReg == AMDGPU::NoRegister) {
    // This will happen if the index is a known constant. This should
    // ordinarily be legalized out, but handle it as a register just in case.
    assert(Offset == 0);
    IdxBaseReg = IdxReg;
  }

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (static_cast<unsigned>(Offset) >= SubRegs.size())
    return std::make_pair(IdxReg, SubRegs[0]);
  return std::make_pair(IdxBaseReg, SubRegs[Offset]);
}
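// Note: a small worked example for computeIndirectRegIndex above. For a
// 128-bit super register class and EltSize == 4, getRegSplitParts returns
// {sub0, sub1, sub2, sub3}; an index of the form (%base + 2) then yields the
// pair (%base, sub2), i.e. the constant part of the index is folded into the
// subregister and only the variable part reaches M0 / the GPR index mode.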
bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register IdxReg = MI.getOperand(2).getReg();

  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
                                                                  *MRI);
  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
                                                                  *MRI);
  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const bool Is64 = DstTy.getSizeInBits() == 64;

  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
                                                     DstTy.getSizeInBits() / 8);

  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
    if (DstTy.getSizeInBits() != 32 && !Is64)
      return false;

    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
    BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
      .addReg(SrcReg, 0, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
    return false;

  if (!STI.useVGPRIndexMode()) {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
    .addReg(IdxReg)
    .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
  BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
    .addReg(SrcReg, RegState::Undef, SubReg)
    .addReg(SrcReg, RegState::Implicit)
    .addReg(AMDGPU::M0, RegState::Implicit);
  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));

  MI.eraseFromParent();
  return true;
}
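// Note: the dynamic extract above takes one of three forms: S_MOVRELS_B32/_B64
// for SGPR vectors, V_MOVRELS_B32_e32 with M0 as the index for VGPR vectors,
// or, when the subtarget supports it, an S_SET_GPR_IDX_ON/OFF bracket around a
// plain V_MOV_B32. In each case the read is relative to the first register of
// the source, with the constant part of the index already folded into SubReg
// by computeIndirectRegIndex.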
// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register VecReg = MI.getOperand(1).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  LLT VecTy = MRI->getType(DstReg);
  LLT ValTy = MRI->getType(ValReg);
  unsigned VecSize = VecTy.getSizeInBits();
  unsigned ValSize = ValTy.getSizeInBits();

  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  assert(VecTy.getElementType() == ValTy);

  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
                                                                  *MRI);
  const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
                                                                  *MRI);

  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
    return false;

  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
                                                     ValSize / 8);

  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
                         STI.useVGPRIndexMode();

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (IndexMode) {
    BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
      .addReg(IdxReg)
      .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
  } else {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
  }

  const MCInstrDesc &RegWriteOp
    = TII.getIndirectRegWritePseudo(VecSize, ValSize,
                                    VecRB->getID() == AMDGPU::SGPRRegBankID);
  BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
    .addReg(VecReg)
    .addReg(ValReg)
    .addImm(SubReg);

  if (IndexMode)
    BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));

  MI.eraseFromParent();
  return true;
}
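// Note: the insert above defers to TII.getIndirectRegWritePseudo to pick an
// indirect-write pseudo matching the vector size and register bank; the
// destination subregister is passed as an immediate, and the write is
// performed relative to M0 (or the GPR index mode set up just before it) when
// the pseudo is expanded later.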
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
  if (I.isPHI())
    return selectPHI(I);

  if (!I.isPreISelOpcode()) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    if (selectG_AND_OR_XOR(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_ADD_SUB(I);
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return selectG_UADDO_USUBO_UADDE_USUBE(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_PTRTOINT:
    return selectCOPY(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
    return selectG_CONSTANT(I);
  case TargetOpcode::G_FNEG:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_FNEG(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_PTR_ADD:
    return selectG_PTR_ADD(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
    if (selectG_ICMP(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_FADD:
    return selectG_LOAD_ATOMICRMW(I);
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
    return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_STORE:
    return selectG_STORE(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
  case TargetOpcode::G_SEXT_INREG:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_SZA_EXT(I);
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE:
    return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
  case TargetOpcode::G_PTR_MASK:
    return selectG_PTR_MASK(I);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return selectG_EXTRACT_VECTOR_ELT(I);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return selectG_INSERT_VECTOR_ELT(I);
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
    initM0(I);
    return selectImpl(I, *CoverageInfo);
  default:
    return selectImpl(I, *CoverageInfo);
  }
  return false;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(
    Register Src) const {
  unsigned Mods = 0;
  MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);

  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::NEG;
    MI = getDefIgnoringCopies(Src, *MRI);
  }

  if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::ABS;
  }

  return std::make_pair(Src, Mods);
}
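// Note: selectVOP3ModsImpl above folds source modifiers by walking the def
// chain, so e.g. (fneg (fabs %x)) selects %x with (SISrcMods::NEG |
// SISrcMods::ABS), while a bare fabs only sets ABS. Only the fneg-of-fabs
// order is matched; in the reverse order only the outer fabs is folded.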
///
/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
  Register Reg = Root.getReg();
  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
  if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
              Def->getOpcode() == AMDGPU::G_FABS))
    return {};
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
  if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
    return None;

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
  // FIXME: Handle clamp and op_sel
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // clamp
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
  // FIXME: Handle op_sel
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  Optional<int64_t> EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  if (!EncodedImm)
    return None;

  unsigned PtrReg = GEPInfo.SgprParts[0];
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
  }};
}
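// Note: selectSmrdImm above only succeeds when AMDGPU::getSMRDEncodedOffset
// can encode the byte offset for the current subtarget (the encoding
// granularity and range differ across generations); otherwise the 32-bit
// literal and SGPR-offset alternatives below get a chance.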
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  unsigned PtrReg = GEPInfo.SgprParts[0];
  Optional<int64_t> EncodedImm =
    AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
  if (!EncodedImm)
    return None;

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an sgpr offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM patterns are considered before the _SGPR patterns.
  unsigned PtrReg = GEPInfo.SgprParts[0];
  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}
template <bool Signed>
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();

  InstructionSelector::ComplexRendererFns Default = {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
  }};

  if (!STI.hasFlatInstOffsets())
    return Default;

  const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
  if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
    return Default;

  Optional<int64_t> Offset =
    getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
  if (!Offset.hasValue())
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
    return Default;

  Register BasePtr = OpDef->getOperand(1).getReg();

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  return selectFlatOffsetImpl<false>(Root);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
  return selectFlatOffsetImpl<true>(Root);
}

static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
      .addImm(Offset & ~4095);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               const MachineMemOperand *MMO = *MI->memoperands_begin();
               const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

               Register SOffsetReg = isStackPtrRelative(PtrInfo)
                                         ? Info->getStackPtrOffsetReg()
                                         : Info->getScratchWaveOffsetReg();
               MIB.addReg(SOffsetReg);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    if (isBaseWithConstantOffset(Root, *MRI)) {
      const MachineOperand &LHS = RootDef->getOperand(1);
      const MachineOperand &RHS = RootDef->getOperand(2);
      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
      if (LHSDef && RHSDef) {
        int64_t PossibleOffset =
          RHSDef->getOperand(1).getCImm()->getSExtValue();
        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
            (!STI.privateMemoryResourceIsRangeChecked() ||
             KnownBits->signBitIsZero(LHS.getReg()))) {
          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
            FI = LHSDef->getOperand(1).getIndex();
          else
            VAddr = LHS.getReg();
          Offset = PossibleOffset;
        }
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  // TODO: Should split large offsets that don't fit like above.
  // TODO: Don't use scratch wave offset just because the offset didn't fit.
  Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
                                   : Info->getScratchWaveOffsetReg();

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             MIB.addReg(SOffset);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}

bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
                                                int64_t Offset,
                                                unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return KnownBits->signBitIsZero(Base);
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  Register SOffsetReg = isStackPtrRelative(PtrInfo)
                            ? Info->getStackPtrOffsetReg()
                            : Info->getScratchWaveOffsetReg();
  return {{
    [=](MachineInstrBuilder &MIB) {
      MIB.addReg(Info->getScratchRSrcReg());
    },                                                         // rsrc
    [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }      // offset
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    if (isDSOffsetLegal(PtrBase, Offset, 16)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, Offset);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    int64_t DWordOffset0 = Offset / 4;
    int64_t DWordOffset1 = DWordOffset0 + 1;
    if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, DWordOffset0);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::make_pair(Root.getReg(), 0);
}
/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this
/// does not match the pattern.
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
    Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = MRI.getVRegDef(Root);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0};

  MachineOperand &RHS = RootI->getOperand(2);
  Optional<ValueAndVReg> MaybeOffset
    = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
  if (!MaybeOffset)
    return {Root, 0};
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
}

static void addZeroImm(MachineInstrBuilder &MIB) {
  MIB.addImm(0);
}

/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If
/// \p BasePtr is not valid, a null base pointer will be used.
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}

static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}
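// Note: the 128-bit resource descriptor built above is assembled from the
// base pointer (or 0) in the low pair and the two constant words in the high
// pair, i.e. roughly:
//
//   %w2:sreg_32    = S_MOV_B32 <FormatLo>
//   %w3:sreg_32    = S_MOV_B32 <FormatHi>
//   %hi:sreg_64    = REG_SEQUENCE %w2, %subreg.sub0, %w3, %subreg.sub1
//   %rsrc:sgpr_128 = REG_SEQUENCE %base, %subreg.sub0_sub1,
//                                 %hi, %subreg.sub2_sub3
//
// The constant half is built first so it can be CSE'd between descriptors, as
// the comment in buildRSRC notes. Register names here are illustrative.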
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted.
    // FIXME: Don't know that this was defined by operand 0.
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return true if the addr64 mubuf mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
    MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(SOffset)
    .addImm(ImmOffset);
  ImmOffset = 0;
}

bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
    MachineOperand &Root, Register &VAddr, Register &RSrcReg,
    Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // The addr64 bit was removed for Volcanic Islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource.
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}
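// Note: in selectMUBUFAddr64Impl above, whichever part of the (ptr_add) chain
// is uniform becomes the SRD base pointer and the divergent part becomes
// vaddr; if both parts are divergent, the already-added result N0 is used as
// vaddr with a null SRD base. Offsets that do not fit the MUBUF immediate
// field are moved into soffset by splitIllegalMUBUFOffset.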
bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
    MachineOperand &Root, Register &RSrcReg, Register &SOffset,
    int64_t &Offset) const {
  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Register SRDPtr = AddrData.N0;
  Offset = AddrData.Offset;

  // TODO: Look through extensions for 32-bit soffset.
  MachineIRBuilder B(*Root.getParent());

  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
    [=](MachineInstrBuilder &MIB) { // rsrc
      MIB.addReg(RSrcReg);
    },
    [=](MachineInstrBuilder &MIB) { // vaddr
      MIB.addReg(VAddr);
    },
    [=](MachineInstrBuilder &MIB) { // soffset
      if (SOffset)
        MIB.addReg(SOffset);
      else
        MIB.addImm(0);
    },
    [=](MachineInstrBuilder &MIB) { // offset
      MIB.addImm(Offset);
    },
    addZeroImm, // glc
    addZeroImm, // slc
    addZeroImm, // tfe
    addZeroImm, // dlc
    addZeroImm  // swz
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
    [=](MachineInstrBuilder &MIB) { // rsrc
      MIB.addReg(RSrcReg);
    },
    [=](MachineInstrBuilder &MIB) { // soffset
      if (SOffset)
        MIB.addReg(SOffset);
      else
        MIB.addImm(0);
    },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
    addZeroImm, // glc
    addZeroImm, // slc
    addZeroImm, // tfe
    addZeroImm, // dlc
    addZeroImm  // swz
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
    [=](MachineInstrBuilder &MIB) { // rsrc
      MIB.addReg(RSrcReg);
    },
    [=](MachineInstrBuilder &MIB) { // vaddr
      MIB.addReg(VAddr);
    },
    [=](MachineInstrBuilder &MIB) { // soffset
      if (SOffset)
        MIB.addReg(SOffset);
      else
        MIB.addImm(0);
    },
    [=](MachineInstrBuilder &MIB) { // offset
      MIB.addImm(Offset);
    },
    addZeroImm // slc
  }};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
    [=](MachineInstrBuilder &MIB) { // rsrc
      MIB.addReg(RSrcReg);
    },
    [=](MachineInstrBuilder &MIB) { // soffset
      if (SOffset)
        MIB.addReg(SOffset);
      else
        MIB.addImm(0);
    },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
    addZeroImm // slc
  }};
}

/// Get an immediate that must be 32-bits, and treated as zero extended.
static Optional<uint64_t> getConstantZext32Val(Register Reg,
                                               const MachineRegisterInfo &MRI) {
  // getConstantVRegVal sexts any values, so see if that matters.
  Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return None;
  return Lo_32(*OffsetVal);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  Optional<int64_t> EncodedImm
    = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx == -1);

  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
    MIB.addImm(Op.getCImm()->getSExtValue());
  }
}
void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
}

/// This only really exists to satisfy DAG type checking machinery, so is a
/// no-op here.
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm());
}

void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
}

void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
}

void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
}

void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}

bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}
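// Note: the renderExtract* helpers above assume the matched cache policy
// immediate packs its bits as glc | (slc << 1) | (dlc << 2) | (swz << 3),
// which is what the per-bit shifts in each renderer extract.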