//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
  if (TargetRegisterInfo::isPhysicalRegister(Reg))
    return Reg == AMDGPU::SCC;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID)
      return false;
    const LLT Ty = MRI.getType(Reg);
    return Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::SCCRegBankID;
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  if (TargetRegisterInfo::isPhysicalRegister(Reg))
    return Reg == TRI.getVCC();

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           MRI.getType(Reg).getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}
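
// Booleans are modeled in two different ways here: an SCC-bank boolean is an
// s1 value carried in a 32-bit SGPR (or the physical SCC bit itself), while a
// VCC-bank boolean is a wave-wide lane mask. Roughly, in MIR terms:
//   %uniform:scc(s1)   = G_ICMP ...  ; one condition bit for the whole wave
//   %divergent:vcc(s1) = G_ICMP ...  ; one condition bit per lane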

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  I.setDesc(TII.get(TargetOpcode::COPY));

  // Special case for COPY from the scc register bank. The scc register bank
  // is modeled using 32-bit sgprs.
  const MachineOperand &Src = I.getOperand(1);
  unsigned SrcReg = Src.getReg();
  if (!TargetRegisterInfo::isPhysicalRegister(SrcReg) && isSCC(SrcReg, MRI)) {
    unsigned DstReg = I.getOperand(0).getReg();

    // Specially handle scc->vcc copies.
    if (isVCC(DstReg, MRI)) {
      const DebugLoc &DL = I.getDebugLoc();
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(SrcReg);
      if (!MRI.getRegClassOrNull(SrcReg))
        MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI));
      I.eraseFromParent();
      return true;
    }
  }

  for (const MachineOperand &MO : I.operands()) {
    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI.getType(DefReg);

  // TODO: Verify this doesn't have insane operands (e.g. a VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
      MRI.getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    if (RB.getID() == AMDGPU::SCCRegBankID) {
      LLVM_DEBUG(dbgs() << "illegal scc phi\n");
      return false;
    }

    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
}
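
/// Extract the 32-bit low or high half (\p SubIdx == AMDGPU::sub0 or
/// AMDGPU::sub1) of the 64-bit operand \p MO: register operands are split
/// with a subregister COPY into \p SubRC, immediates arithmetically.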
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {
  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = MRI.createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    unsigned Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("don't know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static int64_t getConstant(const MachineInstr *MI) {
  return MI->getOperand(1).getCImm()->getSExtValue();
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
          .add(I.getOperand(1))
          .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;

    Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
          .addDef(UnusedCarry, RegState::Dead)
          .add(I.getOperand(1))
          .add(I.getOperand(2))
          .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");
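
  // The 64-bit add is expanded into a 32-bit add of the low halves followed
  // by a 32-bit add-with-carry of the high halves, e.g. for the SALU case
  // (with hypothetical registers):
  //   %dst_lo = S_ADD_U32  %lo1, %lo2   ; implicitly defines SCC (carry out)
  //   %dst_hi = S_ADDC_U32 %hi1, %hi2   ; implicitly reads SCC (carry in)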
  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI.createVirtualRegister(&HalfRC);
  Register DstHi = MRI.createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  assert(I.getOperand(2).getImm() % 32 == 0);
  unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32);
  const DebugLoc &DL = I.getDebugLoc();
  MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY),
                               I.getOperand(0).getReg())
                           .addReg(I.getOperand(1).getReg(), 0, SubReg);

  for (const MachineOperand &MO : Copy->operands()) {
    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  I.eraseFromParent();
  return true;
}
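
// A merge selects to a single REG_SEQUENCE over the source pieces, e.g.
// (hypothetical registers):
//   %dst:sgpr(s64) = G_MERGE_VALUES %lo:sgpr(s32), %hi:sgpr(s32)
// becomes:
//   %dst:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1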
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI))
    return false;

  MI.eraseFromParent();
  return true;
}
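
// The inverse of the merge case above: each destination becomes a subregister
// COPY out of the wide source, e.g. (hypothetical registers):
//   %lo:sgpr(s32), %hi:sgpr(s32) = G_UNMERGE_VALUES %src:sgpr(s64)
// becomes:
//   %lo = COPY %src.sub0
//   %hi = COPY %src.sub1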
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg0);
  LLT SrcTy = MRI.getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
    return false;

  const unsigned SrcFlags = getUndefRegState(Src.isUndef());

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, SrcFlags, SubRegs[I]);

    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(Dst, MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI);
  if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32);
  DebugLoc DL = I.getDebugLoc();
  MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG))
                          .addDef(I.getOperand(0).getReg())
                          .addReg(I.getOperand(1).getReg())
                          .addReg(I.getOperand(2).getReg())
                          .addImm(SubReg);

  for (const MachineOperand &MO : Ins->operands()) {
    if (!MO.isReg())
      continue;
    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(
  MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
  unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::maxnum:
  case Intrinsic::minnum:
  case Intrinsic::amdgcn_cvt_pkrtz:
    return selectImpl(I, CoverageInfo);
  default:
    return selectImpl(I, CoverageInfo);
  }
}
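
/// \returns the VALU (VOP3 _e64 form) compare opcode for the integer
/// predicate \p P at the given operand width, or -1 if \p Size is neither 32
/// nor 64.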
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = I.getDebugLoc();

  unsigned SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  unsigned CCReg = I.getOperand(0).getReg();
  if (isSCC(CCReg, MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
        .add(I.getOperand(2))
        .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
                               I.getOperand(0).getReg())
      .add(I.getOperand(2))
      .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               AMDGPU::SReg_64RegClass, MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}
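
/// Build an export instruction; \p Done selects EXP_DONE over EXP. \p Tgt is
/// the export target, \p Reg0..\p Reg3 the source registers, \p Enabled the
/// channel enable mask, \p Compr the compressed-export flag and \p VM the
/// valid-mask flag.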
static MachineInstr *
buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
         unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3,
         unsigned VM, bool Compr, unsigned Enabled, bool Done) {
  const DebugLoc &DL = Insert->getDebugLoc();
  MachineBasicBlock &BB = *Insert->getParent();
  unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP;
  return BuildMI(BB, Insert, DL, TII.get(Opcode))
      .addImm(Tgt)
      .addReg(Reg0)
      .addReg(Reg1)
      .addReg(Reg2)
      .addReg(Reg3)
      .addImm(VM)
      .addImm(Compr)
      .addImm(Enabled);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned IntrinsicID = I.getOperand(0).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp: {
    int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
    int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
    int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg()));
    int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg()));

    MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
                                 I.getOperand(4).getReg(),
                                 I.getOperand(5).getReg(),
                                 I.getOperand(6).getReg(),
                                 VM, false, Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  case Intrinsic::amdgcn_exp_compr: {
    const DebugLoc &DL = I.getDebugLoc();
    int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
    int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
    unsigned Reg0 = I.getOperand(3).getReg();
    unsigned Reg1 = I.getOperand(4).getReg();
    unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg()));
    int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg()));

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
    MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
                                 true, Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  default:
    return selectImpl(I, CoverageInfo);
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  unsigned CCReg = CCOp.getReg();
  if (isSCC(CCReg, MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(CCReg);
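
    // S_CSELECT_B32/B64 read their condition from the physical SCC bit, which
    // is why the condition is copied into $scc above instead of being passed
    // as an explicit operand.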

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class that we use
    // to represent the scc bank, so we need to set the register class
    // manually here.
    if (!MRI.getRegClassOrNull(CCReg))
      MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
        .add(I.getOperand(2))
        .add(I.getOperand(3));

    // Note the non-short-circuiting '|': both instructions must be
    // constrained even if the first call fails.
    bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
               constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }
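
  // Otherwise the condition is a VCC-bank lane mask, and the select becomes a
  // per-lane V_CNDMASK_B32, roughly:
  //   %dst:vgpr_32 = V_CNDMASK_B32_e64 0, %false, 0, %true, %cond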

  // Wide VGPR select should have been split in RegBankSelect.
  if (Size > 32)
    return false;

  MachineInstr *Select =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .add(I.getOperand(3))
          .addImm(0)
          .add(I.getOperand(2))
          .add(I.getOperand(1));

  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = I.getDebugLoc();
  unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI);
  if (PtrSize != 64) {
    LLVM_DEBUG(dbgs() << "Unhandled address space\n");
    return false;
  }

  unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
  unsigned Opcode;

  // FIXME: Select store instruction based on address space
  switch (StoreSize) {
  default:
    return false;
  case 32:
    Opcode = AMDGPU::FLAT_STORE_DWORD;
    break;
  case 64:
    Opcode = AMDGPU::FLAT_STORE_DWORDX2;
    break;
  case 96:
    Opcode = AMDGPU::FLAT_STORE_DWORDX3;
    break;
  case 128:
    Opcode = AMDGPU::FLAT_STORE_DWORDX4;
    break;
  }

  MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
      .add(I.getOperand(1))
      .add(I.getOperand(0))
      .addImm(0)  // offset
      .addImm(0)  // glc
      .addImm(0)  // slc
      .addImm(0); // dlc

  // Now that we selected an opcode, we need to constrain the register
  // operands to use appropriate classes.
  bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);

  I.eraseFromParent();
  return Ret;
}

static int sizeToSubRegIndex(unsigned Size) {
  switch (Size) {
  case 32:
    return AMDGPU::sub0;
  case 64:
    return AMDGPU::sub0_sub1;
  case 96:
    return AMDGPU::sub0_sub1_sub2;
  case 128:
    return AMDGPU::sub0_sub1_sub2_sub3;
  case 256:
    return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
  default:
    if (Size < 32)
      return AMDGPU::sub0;
    if (Size > 256)
      return -1;
    return sizeToSubRegIndex(PowerOf2Ceil(Size));
  }
}

bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned DstReg = I.getOperand(0).getReg();
  unsigned SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI.getType(DstReg);
  const LLT SrcTy = MRI.getType(SrcReg);
  if (!DstTy.isScalar())
    return false;

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI);
  if (SrcRB != DstRB)
    return false;

  unsigned DstSize = DstTy.getSizeInBits();
  unsigned SrcSize = SrcTy.getSizeInBits();

  const TargetRegisterClass *SrcRC
    = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI);
  const TargetRegisterClass *DstRC
    = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI);

  if (SrcSize > 32) {
    int SubRegIdx = sizeToSubRegIndex(DstSize);
    if (SubRegIdx == -1)
      return false;

    // Deal with weird cases where the class only partially supports the subreg
    // index.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
    if (!SrcRC)
      return false;

    I.getOperand(1).setSubReg(SubRegIdx);
  }

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
    return false;
  }

  I.setDesc(TII.get(TargetOpcode::COPY));
  return true;
}

/// \returns true if a bitmask for \p Size bits will be an inline immediate.
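/// For example, Size == 6 gives Mask == 63 and Size == 32 gives Mask == ~0u
/// (signed value -1), both of which are inline immediates; Size == 8 gives
/// Mask == 255, which is not (integer inline immediates span [-16, 64]).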
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  Mask = maskTrailingOnes<unsigned>(Size);
  int SignedMask = static_cast<int>(Mask);
  return SignedMask >= -16 && SignedMask <= 64;
}

bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
  bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const unsigned DstReg = I.getOperand(0).getReg();
  const unsigned SrcReg = I.getOperand(1).getReg();

  const LLT DstTy = MRI.getType(DstReg);
  const LLT SrcTy = MRI.getType(SrcReg);
  const LLT S1 = LLT::scalar(1);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  if (!DstTy.isScalar())
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);

  if (SrcBank->getID() == AMDGPU::SCCRegBankID) {
    if (SrcTy != S1 || DstSize > 64) // Invalid
      return false;

    unsigned Opcode =
        DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    const TargetRegisterClass *DstRC =
        DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass;

    // FIXME: Create an extra copy to avoid incorrectly constraining the result
    // of the scc producer.
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg)
      .addReg(SrcReg);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(TmpReg);

    // The instruction operands are backwards from what you would expect.
    BuildMI(MBB, I, DL, TII.get(Opcode), DstReg)
      .addImm(0)
      .addImm(Signed ? -1 : 1);
    return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
  }

  if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) {
    if (SrcTy != S1) // Invalid
      return false;

    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)               // src0_modifiers
        .addImm(0)               // src0
        .addImm(0)               // src1_modifiers
        .addImm(Signed ? -1 : 1) // src1
        .addUse(SrcReg);
    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  }

  if (I.getOpcode() == AMDGPU::G_ANYEXT)
    return selectCOPY(I);

  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
    // 64-bit should have been split up in RegBankSelect

    // Try to use an and with a mask if it will save code size.
    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      MachineInstr *ExtI =
        BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
          .addImm(Mask)
          .addReg(SrcReg);
      return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
    }

    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
        .addReg(SrcReg)
        .addImm(0)        // Offset
        .addImm(SrcSize); // Width
    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  }

  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
    if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI))
      return false;

    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
      const unsigned SextOpc = SrcSize == 8 ?
        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
        .addReg(SrcReg);
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
    }

    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
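    // For example, a zero-extend from 8 bits is encoded as 8 << 16 = 0x80000
    // (offset 0, width 8) in src1.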
    if (DstSize > 32 && SrcSize <= 32) {
      // We need a 64-bit register source, but the high bits don't matter.
      unsigned ExtReg
        = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
      unsigned UndefReg
        = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
        .addReg(SrcReg)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);

      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
        .addReg(ExtReg)
        .addImm(SrcSize << 16);

      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
    }

    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
        .addReg(SrcReg)
        .addImm(Mask);
    } else {
      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
        .addReg(SrcReg)
        .addImm(SrcSize << 16);
    }

    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineOperand &ImmOp = I.getOperand(1);

  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
  if (ImmOp.isFPImm()) {
    const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
    ImmOp.ChangeToImmediate(Imm.getZExtValue());
  } else if (ImmOp.isCImm()) {
    ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue());
  }

  unsigned DstReg = I.getOperand(0).getReg();
  unsigned Size;
  bool IsSgpr;
  const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg());
  if (RB) {
    IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
    Size = MRI.getType(DstReg).getSizeInBits();
  } else {
    const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg);
    IsSgpr = TRI.isSGPRClass(RC);
    Size = TRI.getRegSizeInBits(*RC);
  }

  if (Size != 32 && Size != 64)
    return false;

  unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  if (Size == 32) {
    I.setDesc(TII.get(Opcode));
    I.addImplicitDefUseOperands(*MF);
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  DebugLoc DL = I.getDebugLoc();
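  // A 64-bit constant is split into two 32-bit moves joined by a
  // REG_SEQUENCE, since there is no 64-bit move on either ALU.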
  const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass :
                                           &AMDGPU::VGPR_32RegClass;
  unsigned LoReg = MRI.createVirtualRegister(RC);
  unsigned HiReg = MRI.createVirtualRegister(RC);
  APInt Imm(Size, I.getOperand(1).getImm());

  BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
    .addImm(Imm.trunc(32).getZExtValue());

  BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
    .addImm(Imm.ashr(32).getZExtValue());

  const MachineInstr *RS =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
        .addReg(LoReg)
        .addImm(AMDGPU::sub0)
        .addReg(HiReg)
        .addImm(AMDGPU::sub1);

  // We can't call constrainSelectedInstRegOperands here, because it doesn't
  // work for target independent opcodes.
  I.eraseFromParent();
  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI);
  if (!DstRC)
    return true;
  return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_GEP)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1, e = 3; i < e; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (isConstant(*OpDef)) {
      // FIXME: Is it possible to have multiple Imm parts?  Maybe if we
      // are lacking other optimizations.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}

bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input.  These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
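  // (These pointer kinds are all known at compile time or per-wave constant,
  // so the loaded value is the same in every lane.)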
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  Register PtrReg = I.getOperand(1).getReg();
  unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  unsigned Opcode;

  if (MRI.getType(PtrReg).getSizeInBits() == 32) {
    LLVM_DEBUG(dbgs() << "Unhandled address space\n");
    return false;
  }

  SmallVector<GEPInfo, 4> AddrInfo;

  getAddrModeInfo(I, MRI, AddrInfo);

  switch (LoadSize) {
  case 32:
    Opcode = AMDGPU::FLAT_LOAD_DWORD;
    break;
  case 64:
    Opcode = AMDGPU::FLAT_LOAD_DWORDX2;
    break;
  default:
    LLVM_DEBUG(dbgs() << "Unhandled load size\n");
    return false;
  }

  MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
      .add(I.getOperand(0))
      .addReg(PtrReg)
      .addImm(0)  // offset
      .addImm(0)  // glc
      .addImm(0)  // slc
      .addImm(0); // dlc

  bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
  // RegBankSelect knows what it's doing if the branch condition is scc, even
  // though it currently does not.
  if (isSCC(CondReg, MRI)) {
    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    ConstrainRC = &AMDGPU::SReg_32_XM0RegClass;
  } else if (isVCC(CondReg, MRI)) {
    // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // Based on the register bank, we sort of know that a VCC producer ands
    // inactive lanes with 0. What if there was a logical operation with vcc
    // producers in different blocks/with different exec masks?
    // FIXME: Should scc->vcc copies and with exec?
    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  } else
    return false;

  if (!MRI.getRegClassOrNull(CondReg))
    MRI.setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}
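
// A frame index selects to a plain S_MOV_B32 or V_MOV_B32 (depending on the
// destination bank) whose source operand remains a FrameIndex, to be resolved
// when frame indices are eliminated.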
bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
  if (IsVGPR)
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  return RBI.constrainGenericRegister(
      DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI);
}

bool AMDGPUInstructionSelector::select(MachineInstr &I,
                                       CodeGenCoverage &CoverageInfo) const {
  if (I.isPHI())
    return selectPHI(I);

  if (!isPreISelGenericOpcode(I.getOpcode())) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
    if (selectG_ADD_SUB(I))
      return true;
    LLVM_FALLTHROUGH;
  default:
    return selectImpl(I, CoverageInfo);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
    return selectCOPY(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
    return selectG_CONSTANT(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_GEP:
    return selectG_GEP(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
    return selectG_INTRINSIC(I, CoverageInfo);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo);
  case TargetOpcode::G_ICMP:
    if (selectG_ICMP(I))
      return true;
    return selectImpl(I, CoverageInfo);
  case TargetOpcode::G_LOAD:
    if (selectImpl(I, CoverageInfo))
      return true;
    return selectG_LOAD(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_STORE:
    return selectG_STORE(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    if (selectG_SZA_EXT(I)) {
      I.eraseFromParent();
      return true;
    }

    return false;
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_FRAME_INDEX:
    return selectG_FRAME_INDEX(I);
  case TargetOpcode::G_FENCE:
    // FIXME: Tablegen importer doesn't handle the imm operands correctly, and
    // is checking for G_CONSTANT
    I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE));
    return true;
  }
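
// Complex operand renderers: each returns a set of callbacks that add the
// selected operands to the instruction being built, or None if the pattern
// does not apply.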

  return false;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}
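
/// Look through G_FNEG and G_FABS feeding \p Src, folding them into VOP3
/// source modifiers, e.g. (hypothetical) with %t = G_FNEG %s and
/// %s = G_FABS %x, selecting %t as a source yields %x with
/// Mods == (SISrcMods::NEG | SISrcMods::ABS).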
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(
  Register Src, const MachineRegisterInfo &MRI) const {
  unsigned Mods = 0;
  MachineInstr *MI = MRI.getVRegDef(Src);

  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::NEG;
    MI = MRI.getVRegDef(Src);
  }

  if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::ABS;
  }

  return std::make_pair(Src, Mods);
}

/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  MachineRegisterInfo &MRI
    = Root.getParent()->getParent()->getParent()->getRegInfo();

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  MachineRegisterInfo &MRI
    = Root.getParent()->getParent()->getParent()->getRegInfo();

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];

  if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
    return None;

  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  if (!isUInt<32>(EncodedImm))
    return None;

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using a sgpr offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM Patterns are considered before the _SGPR patterns.
  unsigned PtrReg = GEPInfo.SgprParts[0];
  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}