1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUInstructionSelector.h" 15 #include "AMDGPUInstrInfo.h" 16 #include "AMDGPUGlobalISelUtils.h" 17 #include "AMDGPURegisterBankInfo.h" 18 #include "AMDGPURegisterInfo.h" 19 #include "AMDGPUSubtarget.h" 20 #include "AMDGPUTargetMachine.h" 21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 22 #include "SIMachineFunctionInfo.h" 23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 24 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 25 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 28 #include "llvm/CodeGen/GlobalISel/Utils.h" 29 #include "llvm/CodeGen/MachineBasicBlock.h" 30 #include "llvm/CodeGen/MachineFunction.h" 31 #include "llvm/CodeGen/MachineInstr.h" 32 #include "llvm/CodeGen/MachineInstrBuilder.h" 33 #include "llvm/CodeGen/MachineRegisterInfo.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/Support/Debug.h" 36 #include "llvm/Support/raw_ostream.h" 37 38 #define DEBUG_TYPE "amdgpu-isel" 39 40 using namespace llvm; 41 using namespace MIPatternMatch; 42 43 #define GET_GLOBALISEL_IMPL 44 #define AMDGPUSubtarget GCNSubtarget 45 #include "AMDGPUGenGlobalISel.inc" 46 #undef GET_GLOBALISEL_IMPL 47 #undef AMDGPUSubtarget 48 49 AMDGPUInstructionSelector::AMDGPUInstructionSelector( 50 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, 51 const AMDGPUTargetMachine &TM) 52 : InstructionSelector(), TII(*STI.getInstrInfo()), 53 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), 54 STI(STI), 55 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), 56 #define GET_GLOBALISEL_PREDICATES_INIT 57 #include "AMDGPUGenGlobalISel.inc" 58 #undef GET_GLOBALISEL_PREDICATES_INIT 59 #define GET_GLOBALISEL_TEMPORARIES_INIT 60 #include "AMDGPUGenGlobalISel.inc" 61 #undef GET_GLOBALISEL_TEMPORARIES_INIT 62 { 63 } 64 65 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } 66 67 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, 68 CodeGenCoverage &CoverageInfo) { 69 MRI = &MF.getRegInfo(); 70 InstructionSelector::setupMF(MF, KB, CoverageInfo); 71 } 72 73 bool AMDGPUInstructionSelector::isVCC(Register Reg, 74 const MachineRegisterInfo &MRI) const { 75 if (Register::isPhysicalRegister(Reg)) 76 return Reg == TRI.getVCC(); 77 78 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 79 const TargetRegisterClass *RC = 80 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 81 if (RC) { 82 const LLT Ty = MRI.getType(Reg); 83 return RC->hasSuperClassEq(TRI.getBoolRC()) && 84 Ty.isValid() && Ty.getSizeInBits() == 1; 85 } 86 87 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 88 return RB->getID() == AMDGPU::VCCRegBankID; 89 } 90 91 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, 92 unsigned NewOpc) const { 93 MI.setDesc(TII.get(NewOpc)); 94 
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to know
    // specifically that the result is VCCRegBank, and not just an SGPR
    // with size 1. An SReg_32 with size 1 is ambiguous with wave32.
174 if (Src.isUndef()) { 175 const TargetRegisterClass *SrcRC = 176 TRI.getConstrainedRegClassForOperand(Src, *MRI); 177 if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 178 return false; 179 } 180 181 return true; 182 } 183 184 for (const MachineOperand &MO : I.operands()) { 185 if (Register::isPhysicalRegister(MO.getReg())) 186 continue; 187 188 const TargetRegisterClass *RC = 189 TRI.getConstrainedRegClassForOperand(MO, *MRI); 190 if (!RC) 191 continue; 192 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 193 } 194 return true; 195 } 196 197 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { 198 const Register DefReg = I.getOperand(0).getReg(); 199 const LLT DefTy = MRI->getType(DefReg); 200 201 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy) 202 203 const RegClassOrRegBank &RegClassOrBank = 204 MRI->getRegClassOrRegBank(DefReg); 205 206 const TargetRegisterClass *DefRC 207 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 208 if (!DefRC) { 209 if (!DefTy.isValid()) { 210 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 211 return false; 212 } 213 214 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 215 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); 216 if (!DefRC) { 217 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 218 return false; 219 } 220 } 221 222 // TODO: Verify that all registers have the same bank 223 I.setDesc(TII.get(TargetOpcode::PHI)); 224 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); 225 } 226 227 MachineOperand 228 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, 229 const TargetRegisterClass &SubRC, 230 unsigned SubIdx) const { 231 232 MachineInstr *MI = MO.getParent(); 233 MachineBasicBlock *BB = MO.getParent()->getParent(); 234 Register DstReg = MRI->createVirtualRegister(&SubRC); 235 236 if (MO.isReg()) { 237 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); 238 Register Reg = MO.getReg(); 239 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 240 .addReg(Reg, 0, ComposedSubIdx); 241 242 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), 243 MO.isKill(), MO.isDead(), MO.isUndef(), 244 MO.isEarlyClobber(), 0, MO.isDebug(), 245 MO.isInternalRead()); 246 } 247 248 assert(MO.isImm()); 249 250 APInt Imm(64, MO.getImm()); 251 252 switch (SubIdx) { 253 default: 254 llvm_unreachable("do not know to split immediate with this sub index."); 255 case AMDGPU::sub0: 256 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); 257 case AMDGPU::sub1: 258 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); 259 } 260 } 261 262 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { 263 switch (Opc) { 264 case AMDGPU::G_AND: 265 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 266 case AMDGPU::G_OR: 267 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; 268 case AMDGPU::G_XOR: 269 return Is64 ? 
AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 270 default: 271 llvm_unreachable("not a bit op"); 272 } 273 } 274 275 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 276 MachineOperand &Dst = I.getOperand(0); 277 MachineOperand &Src0 = I.getOperand(1); 278 MachineOperand &Src1 = I.getOperand(2); 279 Register DstReg = Dst.getReg(); 280 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 281 282 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 283 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 284 const TargetRegisterClass *RC = TRI.getBoolRC(); 285 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), 286 RC == &AMDGPU::SReg_64RegClass); 287 I.setDesc(TII.get(InstOpc)); 288 289 // FIXME: Hack to avoid turning the register bank into a register class. 290 // The selector for G_ICMP relies on seeing the register bank for the result 291 // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will 292 // be ambiguous whether it's a scalar or vector bool. 293 if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) 294 MRI->setRegClass(Src0.getReg(), RC); 295 if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) 296 MRI->setRegClass(Src1.getReg(), RC); 297 298 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 299 } 300 301 // TODO: Should this allow an SCC bank result, and produce a copy from SCC for 302 // the result? 303 if (DstRB->getID() == AMDGPU::SGPRRegBankID) { 304 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); 305 I.setDesc(TII.get(InstOpc)); 306 // Dead implicit-def of scc 307 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 308 true, // isImp 309 false, // isKill 310 true)); // isDead 311 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 312 } 313 314 return false; 315 } 316 317 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 318 MachineBasicBlock *BB = I.getParent(); 319 MachineFunction *MF = BB->getParent(); 320 Register DstReg = I.getOperand(0).getReg(); 321 const DebugLoc &DL = I.getDebugLoc(); 322 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 323 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 324 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 325 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 326 327 if (Size == 32) { 328 if (IsSALU) { 329 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 330 MachineInstr *Add = 331 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 332 .add(I.getOperand(1)) 333 .add(I.getOperand(2)); 334 I.eraseFromParent(); 335 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 336 } 337 338 if (STI.hasAddNoCarry()) { 339 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 340 I.setDesc(TII.get(Opc)); 341 I.addOperand(*MF, MachineOperand::CreateImm(0)); 342 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 343 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 344 } 345 346 const unsigned Opc = Sub ? 
AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; 347 348 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); 349 MachineInstr *Add 350 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 351 .addDef(UnusedCarry, RegState::Dead) 352 .add(I.getOperand(1)) 353 .add(I.getOperand(2)) 354 .addImm(0); 355 I.eraseFromParent(); 356 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 357 } 358 359 assert(!Sub && "illegal sub should not reach here"); 360 361 const TargetRegisterClass &RC 362 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; 363 const TargetRegisterClass &HalfRC 364 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass; 365 366 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0)); 367 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0)); 368 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); 369 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); 370 371 Register DstLo = MRI->createVirtualRegister(&HalfRC); 372 Register DstHi = MRI->createVirtualRegister(&HalfRC); 373 374 if (IsSALU) { 375 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) 376 .add(Lo1) 377 .add(Lo2); 378 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) 379 .add(Hi1) 380 .add(Hi2); 381 } else { 382 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); 383 Register CarryReg = MRI->createVirtualRegister(CarryRC); 384 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo) 385 .addDef(CarryReg) 386 .add(Lo1) 387 .add(Lo2) 388 .addImm(0); 389 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) 390 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) 391 .add(Hi1) 392 .add(Hi2) 393 .addReg(CarryReg, RegState::Kill) 394 .addImm(0); 395 396 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) 397 return false; 398 } 399 400 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 401 .addReg(DstLo) 402 .addImm(AMDGPU::sub0) 403 .addReg(DstHi) 404 .addImm(AMDGPU::sub1); 405 406 407 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 408 return false; 409 410 I.eraseFromParent(); 411 return true; 412 } 413 414 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( 415 MachineInstr &I) const { 416 MachineBasicBlock *BB = I.getParent(); 417 MachineFunction *MF = BB->getParent(); 418 const DebugLoc &DL = I.getDebugLoc(); 419 Register Dst0Reg = I.getOperand(0).getReg(); 420 Register Dst1Reg = I.getOperand(1).getReg(); 421 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO || 422 I.getOpcode() == AMDGPU::G_UADDE; 423 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE || 424 I.getOpcode() == AMDGPU::G_USUBE; 425 426 if (isVCC(Dst1Reg, *MRI)) { 427 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned 428 // carry out despite the _i32 name. These were renamed in VI to _U32. 429 // FIXME: We should probably rename the opcodes here. 430 unsigned NoCarryOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; 431 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 432 I.setDesc(TII.get(HasCarryIn ? 
CarryOpc : NoCarryOpc)); 433 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 434 I.addOperand(*MF, MachineOperand::CreateImm(0)); 435 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 436 } 437 438 Register Src0Reg = I.getOperand(2).getReg(); 439 Register Src1Reg = I.getOperand(3).getReg(); 440 441 if (HasCarryIn) { 442 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 443 .addReg(I.getOperand(4).getReg()); 444 } 445 446 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 447 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 448 449 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg) 450 .add(I.getOperand(2)) 451 .add(I.getOperand(3)); 452 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 453 .addReg(AMDGPU::SCC); 454 455 if (!MRI->getRegClassOrNull(Dst1Reg)) 456 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 457 458 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 459 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 460 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 461 return false; 462 463 if (HasCarryIn && 464 !RBI.constrainGenericRegister(I.getOperand(4).getReg(), 465 AMDGPU::SReg_32RegClass, *MRI)) 466 return false; 467 468 I.eraseFromParent(); 469 return true; 470 } 471 472 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 473 MachineBasicBlock *BB = I.getParent(); 474 Register DstReg = I.getOperand(0).getReg(); 475 Register SrcReg = I.getOperand(1).getReg(); 476 LLT DstTy = MRI->getType(DstReg); 477 LLT SrcTy = MRI->getType(SrcReg); 478 const unsigned SrcSize = SrcTy.getSizeInBits(); 479 const unsigned DstSize = DstTy.getSizeInBits(); 480 481 // TODO: Should handle any multiple of 32 offset. 
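  // The G_EXTRACT offset operand is a bit offset; the subregister-copy
  // selection below only handles offsets that are a multiple of the
  // destination size.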
482 unsigned Offset = I.getOperand(2).getImm(); 483 if (Offset % DstSize != 0) 484 return false; 485 486 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 487 const TargetRegisterClass *SrcRC = 488 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 489 if (!SrcRC) 490 return false; 491 492 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 493 494 const DebugLoc &DL = I.getDebugLoc(); 495 MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) 496 .addReg(SrcReg, 0, SubRegs[Offset / DstSize]); 497 498 for (const MachineOperand &MO : Copy->operands()) { 499 const TargetRegisterClass *RC = 500 TRI.getConstrainedRegClassForOperand(MO, *MRI); 501 if (!RC) 502 continue; 503 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 504 } 505 I.eraseFromParent(); 506 return true; 507 } 508 509 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 510 MachineBasicBlock *BB = MI.getParent(); 511 Register DstReg = MI.getOperand(0).getReg(); 512 LLT DstTy = MRI->getType(DstReg); 513 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 514 515 const unsigned SrcSize = SrcTy.getSizeInBits(); 516 if (SrcSize < 32) 517 return selectImpl(MI, *CoverageInfo); 518 519 const DebugLoc &DL = MI.getDebugLoc(); 520 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 521 const unsigned DstSize = DstTy.getSizeInBits(); 522 const TargetRegisterClass *DstRC = 523 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 524 if (!DstRC) 525 return false; 526 527 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 528 MachineInstrBuilder MIB = 529 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 530 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 531 MachineOperand &Src = MI.getOperand(I + 1); 532 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 533 MIB.addImm(SubRegs[I]); 534 535 const TargetRegisterClass *SrcRC 536 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 537 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 538 return false; 539 } 540 541 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 542 return false; 543 544 MI.eraseFromParent(); 545 return true; 546 } 547 548 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 549 MachineBasicBlock *BB = MI.getParent(); 550 const int NumDst = MI.getNumOperands() - 1; 551 552 MachineOperand &Src = MI.getOperand(NumDst); 553 554 Register SrcReg = Src.getReg(); 555 Register DstReg0 = MI.getOperand(0).getReg(); 556 LLT DstTy = MRI->getType(DstReg0); 557 LLT SrcTy = MRI->getType(SrcReg); 558 559 const unsigned DstSize = DstTy.getSizeInBits(); 560 const unsigned SrcSize = SrcTy.getSizeInBits(); 561 const DebugLoc &DL = MI.getDebugLoc(); 562 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 563 564 const TargetRegisterClass *SrcRC = 565 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 566 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 567 return false; 568 569 const unsigned SrcFlags = getUndefRegState(Src.isUndef()); 570 571 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 572 // source, and this relies on the fact that the same subregister indices are 573 // used for both. 
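  // getRegSplitParts takes the piece size in bytes, hence the conversion from
  // the destination size in bits.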
574 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 575 for (int I = 0, E = NumDst; I != E; ++I) { 576 MachineOperand &Dst = MI.getOperand(I); 577 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 578 .addReg(SrcReg, SrcFlags, SubRegs[I]); 579 580 const TargetRegisterClass *DstRC = 581 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 582 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 583 return false; 584 } 585 586 MI.eraseFromParent(); 587 return true; 588 } 589 590 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 591 return selectG_ADD_SUB(I); 592 } 593 594 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 595 const MachineOperand &MO = I.getOperand(0); 596 597 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 598 // regbank check here is to know why getConstrainedRegClassForOperand failed. 599 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 600 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 601 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 602 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 603 return true; 604 } 605 606 return false; 607 } 608 609 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 610 MachineBasicBlock *BB = I.getParent(); 611 612 Register DstReg = I.getOperand(0).getReg(); 613 Register Src0Reg = I.getOperand(1).getReg(); 614 Register Src1Reg = I.getOperand(2).getReg(); 615 LLT Src1Ty = MRI->getType(Src1Reg); 616 617 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 618 unsigned InsSize = Src1Ty.getSizeInBits(); 619 620 int64_t Offset = I.getOperand(3).getImm(); 621 if (Offset % 32 != 0) 622 return false; 623 624 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 625 if (SubReg == AMDGPU::NoSubRegister) 626 return false; 627 628 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 629 const TargetRegisterClass *DstRC = 630 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 631 if (!DstRC) 632 return false; 633 634 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 635 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 636 const TargetRegisterClass *Src0RC = 637 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 638 const TargetRegisterClass *Src1RC = 639 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 640 641 // Deal with weird cases where the class only partially supports the subreg 642 // index. 
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
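    // SI_IF_BREAK is a pseudo expanded later by SILowerControlFlow. All three
    // operands carry the wave-sized mask, which is why they are constrained to
    // the wave mask register class below.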
715 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK)) 716 .add(I.getOperand(0)) 717 .add(I.getOperand(2)) 718 .add(I.getOperand(3)); 719 720 Register DstReg = I.getOperand(0).getReg(); 721 Register Src0Reg = I.getOperand(2).getReg(); 722 Register Src1Reg = I.getOperand(3).getReg(); 723 724 I.eraseFromParent(); 725 726 for (Register Reg : { DstReg, Src0Reg, Src1Reg }) 727 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 728 729 return true; 730 } 731 case Intrinsic::amdgcn_interp_p1_f16: 732 return selectInterpP1F16(I); 733 case Intrinsic::amdgcn_wqm: 734 return constrainCopyLikeIntrin(I, AMDGPU::WQM); 735 case Intrinsic::amdgcn_softwqm: 736 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); 737 case Intrinsic::amdgcn_wwm: 738 return constrainCopyLikeIntrin(I, AMDGPU::WWM); 739 default: 740 return selectImpl(I, *CoverageInfo); 741 } 742 } 743 744 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) { 745 if (Size != 32 && Size != 64) 746 return -1; 747 switch (P) { 748 default: 749 llvm_unreachable("Unknown condition code!"); 750 case CmpInst::ICMP_NE: 751 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64; 752 case CmpInst::ICMP_EQ: 753 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64; 754 case CmpInst::ICMP_SGT: 755 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64; 756 case CmpInst::ICMP_SGE: 757 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64; 758 case CmpInst::ICMP_SLT: 759 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64; 760 case CmpInst::ICMP_SLE: 761 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64; 762 case CmpInst::ICMP_UGT: 763 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64; 764 case CmpInst::ICMP_UGE: 765 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64; 766 case CmpInst::ICMP_ULT: 767 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64; 768 case CmpInst::ICMP_ULE: 769 return Size == 32 ? 
    AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
      .add(I.getOperand(2))
      .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
      constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
      RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
                               I.getOperand(0).getReg())
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
862 MachineBasicBlock *BB = MI.getParent(); 863 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) 864 .add(MI.getOperand(1)); 865 866 Register Reg = MI.getOperand(1).getReg(); 867 MI.eraseFromParent(); 868 869 if (!MRI->getRegClassOrNull(Reg)) 870 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 871 return true; 872 } 873 874 static unsigned getDSShaderTypeValue(const MachineFunction &MF) { 875 switch (MF.getFunction().getCallingConv()) { 876 case CallingConv::AMDGPU_PS: 877 return 1; 878 case CallingConv::AMDGPU_VS: 879 return 2; 880 case CallingConv::AMDGPU_GS: 881 return 3; 882 case CallingConv::AMDGPU_HS: 883 case CallingConv::AMDGPU_LS: 884 case CallingConv::AMDGPU_ES: 885 report_fatal_error("ds_ordered_count unsupported for this calling conv"); 886 case CallingConv::AMDGPU_CS: 887 case CallingConv::AMDGPU_KERNEL: 888 case CallingConv::C: 889 case CallingConv::Fast: 890 default: 891 // Assume other calling conventions are various compute callable functions 892 return 0; 893 } 894 } 895 896 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 897 MachineInstr &MI, Intrinsic::ID IntrID) const { 898 MachineBasicBlock *MBB = MI.getParent(); 899 MachineFunction *MF = MBB->getParent(); 900 const DebugLoc &DL = MI.getDebugLoc(); 901 902 unsigned IndexOperand = MI.getOperand(7).getImm(); 903 bool WaveRelease = MI.getOperand(8).getImm() != 0; 904 bool WaveDone = MI.getOperand(9).getImm() != 0; 905 906 if (WaveDone && !WaveRelease) 907 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 908 909 unsigned OrderedCountIndex = IndexOperand & 0x3f; 910 IndexOperand &= ~0x3f; 911 unsigned CountDw = 0; 912 913 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 914 CountDw = (IndexOperand >> 24) & 0xf; 915 IndexOperand &= ~(0xf << 24); 916 917 if (CountDw < 1 || CountDw > 4) { 918 report_fatal_error( 919 "ds_ordered_count: dword count must be between 1 and 4"); 920 } 921 } 922 923 if (IndexOperand) 924 report_fatal_error("ds_ordered_count: bad index operand"); 925 926 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; 927 unsigned ShaderType = getDSShaderTypeValue(*MF); 928 929 unsigned Offset0 = OrderedCountIndex << 2; 930 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | 931 (Instruction << 4); 932 933 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) 934 Offset1 |= (CountDw - 1) << 6; 935 936 unsigned Offset = Offset0 | (Offset1 << 8); 937 938 Register M0Val = MI.getOperand(2).getReg(); 939 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 940 .addReg(M0Val); 941 942 Register DstReg = MI.getOperand(0).getReg(); 943 Register ValReg = MI.getOperand(3).getReg(); 944 MachineInstrBuilder DS = 945 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) 946 .addReg(ValReg) 947 .addImm(Offset) 948 .cloneMemRefs(MI); 949 950 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) 951 return false; 952 953 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); 954 MI.eraseFromParent(); 955 return Ret; 956 } 957 958 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 959 switch (IntrID) { 960 case Intrinsic::amdgcn_ds_gws_init: 961 return AMDGPU::DS_GWS_INIT; 962 case Intrinsic::amdgcn_ds_gws_barrier: 963 return AMDGPU::DS_GWS_BARRIER; 964 case Intrinsic::amdgcn_ds_gws_sema_v: 965 return AMDGPU::DS_GWS_SEMA_V; 966 case Intrinsic::amdgcn_ds_gws_sema_br: 967 return AMDGPU::DS_GWS_SEMA_BR; 968 case Intrinsic::amdgcn_ds_gws_sema_p: 969 return AMDGPU::DS_GWS_SEMA_P; 970 case Intrinsic::amdgcn_ds_gws_sema_release_all: 971 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 972 default: 973 llvm_unreachable("not a gws intrinsic"); 974 } 975 } 976 977 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, 978 Intrinsic::ID IID) const { 979 if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && 980 !STI.hasGWSSemaReleaseAll()) 981 return false; 982 983 // intrinsic ID, vsrc, offset 984 const bool HasVSrc = MI.getNumOperands() == 3; 985 assert(HasVSrc || MI.getNumOperands() == 2); 986 987 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg(); 988 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); 989 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) 990 return false; 991 992 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 993 assert(OffsetDef); 994 995 unsigned ImmOffset; 996 997 MachineBasicBlock *MBB = MI.getParent(); 998 const DebugLoc &DL = MI.getDebugLoc(); 999 1000 MachineInstr *Readfirstlane = nullptr; 1001 1002 // If we legalized the VGPR input, strip out the readfirstlane to analyze the 1003 // incoming offset, in case there's an add of a constant. We'll have to put it 1004 // back later. 1005 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { 1006 Readfirstlane = OffsetDef; 1007 BaseOffset = OffsetDef->getOperand(1).getReg(); 1008 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1009 } 1010 1011 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { 1012 // If we have a constant offset, try to use the 0 in m0 as the base. 1013 // TODO: Look into changing the default m0 initialization value. If the 1014 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1015 // the immediate offset. 
1016 1017 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); 1018 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1019 .addImm(0); 1020 } else { 1021 std::tie(BaseOffset, ImmOffset, OffsetDef) 1022 = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset); 1023 1024 if (Readfirstlane) { 1025 // We have the constant offset now, so put the readfirstlane back on the 1026 // variable component. 1027 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) 1028 return false; 1029 1030 Readfirstlane->getOperand(1).setReg(BaseOffset); 1031 BaseOffset = Readfirstlane->getOperand(0).getReg(); 1032 } else { 1033 if (!RBI.constrainGenericRegister(BaseOffset, 1034 AMDGPU::SReg_32RegClass, *MRI)) 1035 return false; 1036 } 1037 1038 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1039 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) 1040 .addReg(BaseOffset) 1041 .addImm(16); 1042 1043 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1044 .addReg(M0Base); 1045 } 1046 1047 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1048 // offset field) % 64. Some versions of the programming guide omit the m0 1049 // part, or claim it's from offset 0. 1050 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); 1051 1052 if (HasVSrc) { 1053 Register VSrc = MI.getOperand(1).getReg(); 1054 MIB.addReg(VSrc); 1055 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) 1056 return false; 1057 } 1058 1059 MIB.addImm(ImmOffset) 1060 .addImm(-1) // $gds 1061 .cloneMemRefs(MI); 1062 1063 MI.eraseFromParent(); 1064 return true; 1065 } 1066 1067 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, 1068 bool IsAppend) const { 1069 Register PtrBase = MI.getOperand(2).getReg(); 1070 LLT PtrTy = MRI->getType(PtrBase); 1071 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 1072 1073 unsigned Offset; 1074 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); 1075 1076 // TODO: Should this try to look through readfirstlane like GWS? 1077 if (!isDSOffsetLegal(PtrBase, Offset, 16)) { 1078 PtrBase = MI.getOperand(2).getReg(); 1079 Offset = 0; 1080 } 1081 1082 MachineBasicBlock *MBB = MI.getParent(); 1083 const DebugLoc &DL = MI.getDebugLoc(); 1084 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1085 1086 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1087 .addReg(PtrBase); 1088 BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) 1089 .addImm(Offset) 1090 .addImm(IsGDS ? 
-1 : 0) 1091 .cloneMemRefs(MI); 1092 1093 MI.eraseFromParent(); 1094 return true; 1095 } 1096 1097 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( 1098 MachineInstr &I) const { 1099 unsigned IntrinsicID = I.getIntrinsicID(); 1100 switch (IntrinsicID) { 1101 case Intrinsic::amdgcn_end_cf: 1102 return selectEndCfIntrinsic(I); 1103 case Intrinsic::amdgcn_ds_ordered_add: 1104 case Intrinsic::amdgcn_ds_ordered_swap: 1105 return selectDSOrderedIntrinsic(I, IntrinsicID); 1106 case Intrinsic::amdgcn_ds_gws_init: 1107 case Intrinsic::amdgcn_ds_gws_barrier: 1108 case Intrinsic::amdgcn_ds_gws_sema_v: 1109 case Intrinsic::amdgcn_ds_gws_sema_br: 1110 case Intrinsic::amdgcn_ds_gws_sema_p: 1111 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1112 return selectDSGWSIntrinsic(I, IntrinsicID); 1113 case Intrinsic::amdgcn_ds_append: 1114 return selectDSAppendConsume(I, true); 1115 case Intrinsic::amdgcn_ds_consume: 1116 return selectDSAppendConsume(I, false); 1117 default: 1118 return selectImpl(I, *CoverageInfo); 1119 } 1120 } 1121 1122 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { 1123 if (selectImpl(I, *CoverageInfo)) 1124 return true; 1125 1126 MachineBasicBlock *BB = I.getParent(); 1127 const DebugLoc &DL = I.getDebugLoc(); 1128 1129 Register DstReg = I.getOperand(0).getReg(); 1130 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 1131 assert(Size <= 32 || Size == 64); 1132 const MachineOperand &CCOp = I.getOperand(1); 1133 Register CCReg = CCOp.getReg(); 1134 if (!isVCC(CCReg, *MRI)) { 1135 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : 1136 AMDGPU::S_CSELECT_B32; 1137 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1138 .addReg(CCReg); 1139 1140 // The generic constrainSelectedInstRegOperands doesn't work for the scc register 1141 // bank, because it does not cover the register class that we used to represent 1142 // for it. So we need to manually set the register class here. 1143 if (!MRI->getRegClassOrNull(CCReg)) 1144 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1145 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1146 .add(I.getOperand(2)) 1147 .add(I.getOperand(3)); 1148 1149 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1150 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1151 I.eraseFromParent(); 1152 return Ret; 1153 } 1154 1155 // Wide VGPR select should have been split in RegBankSelect. 
1156 if (Size > 32) 1157 return false; 1158 1159 MachineInstr *Select = 1160 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1161 .addImm(0) 1162 .add(I.getOperand(3)) 1163 .addImm(0) 1164 .add(I.getOperand(2)) 1165 .add(I.getOperand(1)); 1166 1167 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1168 I.eraseFromParent(); 1169 return Ret; 1170 } 1171 1172 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1173 initM0(I); 1174 return selectImpl(I, *CoverageInfo); 1175 } 1176 1177 static int sizeToSubRegIndex(unsigned Size) { 1178 switch (Size) { 1179 case 32: 1180 return AMDGPU::sub0; 1181 case 64: 1182 return AMDGPU::sub0_sub1; 1183 case 96: 1184 return AMDGPU::sub0_sub1_sub2; 1185 case 128: 1186 return AMDGPU::sub0_sub1_sub2_sub3; 1187 case 256: 1188 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1189 default: 1190 if (Size < 32) 1191 return AMDGPU::sub0; 1192 if (Size > 256) 1193 return -1; 1194 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1195 } 1196 } 1197 1198 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1199 Register DstReg = I.getOperand(0).getReg(); 1200 Register SrcReg = I.getOperand(1).getReg(); 1201 const LLT DstTy = MRI->getType(DstReg); 1202 const LLT SrcTy = MRI->getType(SrcReg); 1203 if (!DstTy.isScalar()) 1204 return false; 1205 1206 const LLT S1 = LLT::scalar(1); 1207 1208 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1209 const RegisterBank *DstRB; 1210 if (DstTy == S1) { 1211 // This is a special case. We don't treat s1 for legalization artifacts as 1212 // vcc booleans. 1213 DstRB = SrcRB; 1214 } else { 1215 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1216 if (SrcRB != DstRB) 1217 return false; 1218 } 1219 1220 unsigned DstSize = DstTy.getSizeInBits(); 1221 unsigned SrcSize = SrcTy.getSizeInBits(); 1222 1223 const TargetRegisterClass *SrcRC 1224 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1225 const TargetRegisterClass *DstRC 1226 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1227 1228 if (SrcSize > 32) { 1229 int SubRegIdx = sizeToSubRegIndex(DstSize); 1230 if (SubRegIdx == -1) 1231 return false; 1232 1233 // Deal with weird cases where the class only partially supports the subreg 1234 // index. 1235 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1236 if (!SrcRC) 1237 return false; 1238 1239 I.getOperand(1).setSubReg(SubRegIdx); 1240 } 1241 1242 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1243 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1244 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1245 return false; 1246 } 1247 1248 I.setDesc(TII.get(TargetOpcode::COPY)); 1249 return true; 1250 } 1251 1252 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1253 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1254 Mask = maskTrailingOnes<unsigned>(Size); 1255 int SignedMask = static_cast<int>(Mask); 1256 return SignedMask >= -16 && SignedMask <= 64; 1257 } 1258 1259 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1260 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1261 Register Reg, const MachineRegisterInfo &MRI, 1262 const TargetRegisterInfo &TRI) const { 1263 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1264 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1265 return RB; 1266 1267 // Ignore the type, since we don't use vcc in artifacts. 
1268 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1269 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1270 return nullptr; 1271 } 1272 1273 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1274 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1275 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1276 const DebugLoc &DL = I.getDebugLoc(); 1277 MachineBasicBlock &MBB = *I.getParent(); 1278 const Register DstReg = I.getOperand(0).getReg(); 1279 const Register SrcReg = I.getOperand(1).getReg(); 1280 1281 const LLT DstTy = MRI->getType(DstReg); 1282 const LLT SrcTy = MRI->getType(SrcReg); 1283 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1284 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1285 const unsigned DstSize = DstTy.getSizeInBits(); 1286 if (!DstTy.isScalar()) 1287 return false; 1288 1289 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1290 return selectCOPY(I); 1291 1292 // Artifact casts should never use vcc. 1293 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1294 1295 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1296 // 64-bit should have been split up in RegBankSelect 1297 1298 // Try to use an and with a mask if it will save code size. 1299 unsigned Mask; 1300 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1301 MachineInstr *ExtI = 1302 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1303 .addImm(Mask) 1304 .addReg(SrcReg); 1305 I.eraseFromParent(); 1306 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1307 } 1308 1309 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1310 MachineInstr *ExtI = 1311 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1312 .addReg(SrcReg) 1313 .addImm(0) // Offset 1314 .addImm(SrcSize); // Width 1315 I.eraseFromParent(); 1316 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1317 } 1318 1319 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1320 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 1321 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 1322 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 1323 return false; 1324 1325 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1326 const unsigned SextOpc = SrcSize == 8 ? 1327 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1328 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1329 .addReg(SrcReg); 1330 I.eraseFromParent(); 1331 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1332 } 1333 1334 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1335 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1336 1337 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1338 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 1339 // We need a 64-bit register source, but the high bits don't matter. 1340 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1341 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1342 unsigned SubReg = InReg ? 
AMDGPU::sub0 : 0; 1343 1344 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1345 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1346 .addReg(SrcReg, 0, SubReg) 1347 .addImm(AMDGPU::sub0) 1348 .addReg(UndefReg) 1349 .addImm(AMDGPU::sub1); 1350 1351 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1352 .addReg(ExtReg) 1353 .addImm(SrcSize << 16); 1354 1355 I.eraseFromParent(); 1356 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1357 } 1358 1359 unsigned Mask; 1360 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1361 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1362 .addReg(SrcReg) 1363 .addImm(Mask); 1364 } else { 1365 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1366 .addReg(SrcReg) 1367 .addImm(SrcSize << 16); 1368 } 1369 1370 I.eraseFromParent(); 1371 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1372 } 1373 1374 return false; 1375 } 1376 1377 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1378 MachineBasicBlock *BB = I.getParent(); 1379 MachineOperand &ImmOp = I.getOperand(1); 1380 1381 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1382 if (ImmOp.isFPImm()) { 1383 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1384 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1385 } else if (ImmOp.isCImm()) { 1386 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1387 } 1388 1389 Register DstReg = I.getOperand(0).getReg(); 1390 unsigned Size; 1391 bool IsSgpr; 1392 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1393 if (RB) { 1394 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1395 Size = MRI->getType(DstReg).getSizeInBits(); 1396 } else { 1397 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1398 IsSgpr = TRI.isSGPRClass(RC); 1399 Size = TRI.getRegSizeInBits(*RC); 1400 } 1401 1402 if (Size != 32 && Size != 64) 1403 return false; 1404 1405 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1406 if (Size == 32) { 1407 I.setDesc(TII.get(Opcode)); 1408 I.addImplicitDefUseOperands(*MF); 1409 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1410 } 1411 1412 const DebugLoc &DL = I.getDebugLoc(); 1413 1414 APInt Imm(Size, I.getOperand(1).getImm()); 1415 1416 MachineInstr *ResInst; 1417 if (IsSgpr && TII.isInlineConstant(Imm)) { 1418 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1419 .addImm(I.getOperand(1).getImm()); 1420 } else { 1421 const TargetRegisterClass *RC = IsSgpr ? 
1422 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 1423 Register LoReg = MRI->createVirtualRegister(RC); 1424 Register HiReg = MRI->createVirtualRegister(RC); 1425 1426 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 1427 .addImm(Imm.trunc(32).getZExtValue()); 1428 1429 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 1430 .addImm(Imm.ashr(32).getZExtValue()); 1431 1432 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1433 .addReg(LoReg) 1434 .addImm(AMDGPU::sub0) 1435 .addReg(HiReg) 1436 .addImm(AMDGPU::sub1); 1437 } 1438 1439 // We can't call constrainSelectedInstRegOperands here, because it doesn't 1440 // work for target independent opcodes 1441 I.eraseFromParent(); 1442 const TargetRegisterClass *DstRC = 1443 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 1444 if (!DstRC) 1445 return true; 1446 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1447 } 1448 1449 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 1450 // Only manually handle the f64 SGPR case. 1451 // 1452 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 1453 // the bit ops theoretically have a second result due to the implicit def of 1454 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 1455 // that is easy by disabling the check. The result works, but uses a 1456 // nonsensical sreg32orlds_and_sreg_1 regclass. 1457 // 1458 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 1459 // the variadic REG_SEQUENCE operands. 1460 1461 Register Dst = MI.getOperand(0).getReg(); 1462 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 1463 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 1464 MRI->getType(Dst) != LLT::scalar(64)) 1465 return false; 1466 1467 Register Src = MI.getOperand(1).getReg(); 1468 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 1469 if (Fabs) 1470 Src = Fabs->getOperand(1).getReg(); 1471 1472 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 1473 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 1474 return false; 1475 1476 MachineBasicBlock *BB = MI.getParent(); 1477 const DebugLoc &DL = MI.getDebugLoc(); 1478 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1479 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1480 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1481 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1482 1483 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 1484 .addReg(Src, 0, AMDGPU::sub0); 1485 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 1486 .addReg(Src, 0, AMDGPU::sub1); 1487 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 1488 .addImm(0x80000000); 1489 1490 // Set or toggle sign bit. 1491 unsigned Opc = Fabs ? 
    AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}

bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
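    // Use -1 (all ones) so the entire LDS range is addressable; this matches
    // the M0 value the SelectionDAG path sets up for DS instructions on
    // subtargets that need it.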
1580 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1581 .addImm(-1); 1582 } 1583 } 1584 1585 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { 1586 initM0(I); 1587 return selectImpl(I, *CoverageInfo); 1588 } 1589 1590 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 1591 MachineBasicBlock *BB = I.getParent(); 1592 MachineOperand &CondOp = I.getOperand(0); 1593 Register CondReg = CondOp.getReg(); 1594 const DebugLoc &DL = I.getDebugLoc(); 1595 1596 unsigned BrOpcode; 1597 Register CondPhysReg; 1598 const TargetRegisterClass *ConstrainRC; 1599 1600 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 1601 // whether the branch is uniform when selecting the instruction. In 1602 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 1603 // RegBankSelect knows what it's doing if the branch condition is scc, even 1604 // though it currently does not. 1605 if (!isVCC(CondReg, *MRI)) { 1606 if (MRI->getType(CondReg) != LLT::scalar(32)) 1607 return false; 1608 1609 CondPhysReg = AMDGPU::SCC; 1610 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 1611 // FIXME: Hack for isSCC tests 1612 ConstrainRC = &AMDGPU::SGPR_32RegClass; 1613 } else { 1614 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 1615 // We sort of know that a VCC producer based on the register bank, that ands 1616 // inactive lanes with 0. What if there was a logical operation with vcc 1617 // producers in different blocks/with different exec masks? 1618 // FIXME: Should scc->vcc copies and with exec? 1619 CondPhysReg = TRI.getVCC(); 1620 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 1621 ConstrainRC = TRI.getBoolRC(); 1622 } 1623 1624 if (!MRI->getRegClassOrNull(CondReg)) 1625 MRI->setRegClass(CondReg, ConstrainRC); 1626 1627 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 1628 .addReg(CondReg); 1629 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 1630 .addMBB(I.getOperand(1).getMBB()); 1631 1632 I.eraseFromParent(); 1633 return true; 1634 } 1635 1636 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE( 1637 MachineInstr &I) const { 1638 Register DstReg = I.getOperand(0).getReg(); 1639 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1640 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1641 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 1642 if (IsVGPR) 1643 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 1644 1645 return RBI.constrainGenericRegister( 1646 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 1647 } 1648 1649 bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { 1650 uint64_t Align = I.getOperand(2).getImm(); 1651 const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); 1652 1653 MachineBasicBlock *BB = I.getParent(); 1654 1655 Register DstReg = I.getOperand(0).getReg(); 1656 Register SrcReg = I.getOperand(1).getReg(); 1657 1658 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1659 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1660 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1661 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1662 unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1663 const TargetRegisterClass &RegRC 1664 = IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 1665 1666 LLT Ty = MRI->getType(DstReg); 1667 1668 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 1669 *MRI); 1670 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 1671 *MRI); 1672 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 1673 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 1674 return false; 1675 1676 const DebugLoc &DL = I.getDebugLoc(); 1677 Register ImmReg = MRI->createVirtualRegister(&RegRC); 1678 BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg) 1679 .addImm(Mask); 1680 1681 if (Ty.getSizeInBits() == 32) { 1682 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 1683 .addReg(SrcReg) 1684 .addReg(ImmReg); 1685 I.eraseFromParent(); 1686 return true; 1687 } 1688 1689 Register HiReg = MRI->createVirtualRegister(&RegRC); 1690 Register LoReg = MRI->createVirtualRegister(&RegRC); 1691 Register MaskLo = MRI->createVirtualRegister(&RegRC); 1692 1693 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 1694 .addReg(SrcReg, 0, AMDGPU::sub0); 1695 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 1696 .addReg(SrcReg, 0, AMDGPU::sub1); 1697 1698 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo) 1699 .addReg(LoReg) 1700 .addReg(ImmReg); 1701 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1702 .addReg(MaskLo) 1703 .addImm(AMDGPU::sub0) 1704 .addReg(HiReg) 1705 .addImm(AMDGPU::sub1); 1706 I.eraseFromParent(); 1707 return true; 1708 } 1709 1710 /// Return the register to use for the index value, and the subregister to use 1711 /// for the indirectly accessed register. 1712 static std::pair<Register, unsigned> 1713 computeIndirectRegIndex(MachineRegisterInfo &MRI, 1714 const SIRegisterInfo &TRI, 1715 const TargetRegisterClass *SuperRC, 1716 Register IdxReg, 1717 unsigned EltSize) { 1718 Register IdxBaseReg; 1719 int Offset; 1720 MachineInstr *Unused; 1721 1722 std::tie(IdxBaseReg, Offset, Unused) 1723 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 1724 if (IdxBaseReg == AMDGPU::NoRegister) { 1725 // This will happen if the index is a known constant. This should ordinarily 1726 // be legalized out, but handle it as a register just in case. 1727 assert(Offset == 0); 1728 IdxBaseReg = IdxReg; 1729 } 1730 1731 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 1732 1733 // Skip out of bounds offsets, or else we would end up using an undefined 1734 // register. 1735 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 1736 return std::make_pair(IdxReg, SubRegs[0]); 1737 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 1738 } 1739 1740 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 1741 MachineInstr &MI) const { 1742 Register DstReg = MI.getOperand(0).getReg(); 1743 Register SrcReg = MI.getOperand(1).getReg(); 1744 Register IdxReg = MI.getOperand(2).getReg(); 1745 1746 LLT DstTy = MRI->getType(DstReg); 1747 LLT SrcTy = MRI->getType(SrcReg); 1748 1749 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1750 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1751 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 1752 1753 // The index must be scalar. If it wasn't, RegBankSelect should have moved this 1754 // into a waterfall loop.
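// For reference, the SGPR-source path below expands to roughly the following
// (illustrative register names, not actual output):
//   $m0 = COPY %idx
//   %dst = S_MOVRELS_B32 %vec.sub0   ; reads %vec[sub0 + M0]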
1755 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 1756 return false; 1757 1758 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 1759 *MRI); 1760 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 1761 *MRI); 1762 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1763 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 1764 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 1765 return false; 1766 1767 MachineBasicBlock *BB = MI.getParent(); 1768 const DebugLoc &DL = MI.getDebugLoc(); 1769 const bool Is64 = DstTy.getSizeInBits() == 64; 1770 1771 unsigned SubReg; 1772 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 1773 DstTy.getSizeInBits() / 8); 1774 1775 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 1776 if (DstTy.getSizeInBits() != 32 && !Is64) 1777 return false; 1778 1779 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1780 .addReg(IdxReg); 1781 1782 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 1783 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 1784 .addReg(SrcReg, 0, SubReg) 1785 .addReg(SrcReg, RegState::Implicit); 1786 MI.eraseFromParent(); 1787 return true; 1788 } 1789 1790 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 1791 return false; 1792 1793 if (!STI.useVGPRIndexMode()) { 1794 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1795 .addReg(IdxReg); 1796 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 1797 .addReg(SrcReg, RegState::Undef, SubReg) 1798 .addReg(SrcReg, RegState::Implicit); 1799 MI.eraseFromParent(); 1800 return true; 1801 } 1802 1803 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 1804 .addReg(IdxReg) 1805 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 1806 BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg) 1807 .addReg(SrcReg, RegState::Undef, SubReg) 1808 .addReg(SrcReg, RegState::Implicit) 1809 .addReg(AMDGPU::M0, RegState::Implicit); 1810 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 1811 1812 MI.eraseFromParent(); 1813 return true; 1814 } 1815 1816 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 1817 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 1818 MachineInstr &MI) const { 1819 Register DstReg = MI.getOperand(0).getReg(); 1820 Register VecReg = MI.getOperand(1).getReg(); 1821 Register ValReg = MI.getOperand(2).getReg(); 1822 Register IdxReg = MI.getOperand(3).getReg(); 1823 1824 LLT VecTy = MRI->getType(DstReg); 1825 LLT ValTy = MRI->getType(ValReg); 1826 unsigned VecSize = VecTy.getSizeInBits(); 1827 unsigned ValSize = ValTy.getSizeInBits(); 1828 1829 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 1830 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 1831 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 1832 1833 assert(VecTy.getElementType() == ValTy); 1834 1835 // The index must be scalar. If it wasn't, RegBankSelect should have moved this 1836 // into a waterfall loop.
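// The indirect register write pseudo built below is expanded later into a
// movreld-style write of the element at [SubReg + M0] (or a GPR index mode
// sequence). Roughly, with illustrative names:
//   $m0 = COPY %idx
//   %newvec = <indirect write pseudo> %vec, %val, sub0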
1837 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 1838 return false; 1839 1840 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 1841 *MRI); 1842 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 1843 *MRI); 1844 1845 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 1846 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 1847 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 1848 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 1849 return false; 1850 1851 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 1852 return false; 1853 1854 unsigned SubReg; 1855 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 1856 ValSize / 8); 1857 1858 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 1859 STI.useVGPRIndexMode(); 1860 1861 MachineBasicBlock *BB = MI.getParent(); 1862 const DebugLoc &DL = MI.getDebugLoc(); 1863 1864 if (IndexMode) { 1865 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 1866 .addReg(IdxReg) 1867 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 1868 } else { 1869 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1870 .addReg(IdxReg); 1871 } 1872 1873 const MCInstrDesc &RegWriteOp 1874 = TII.getIndirectRegWritePseudo(VecSize, ValSize, 1875 VecRB->getID() == AMDGPU::SGPRRegBankID); 1876 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 1877 .addReg(VecReg) 1878 .addReg(ValReg) 1879 .addImm(SubReg); 1880 1881 if (IndexMode) 1882 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 1883 1884 MI.eraseFromParent(); 1885 return true; 1886 } 1887 1888 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 1889 if (I.isPHI()) 1890 return selectPHI(I); 1891 1892 if (!I.isPreISelOpcode()) { 1893 if (I.isCopy()) 1894 return selectCOPY(I); 1895 return true; 1896 } 1897 1898 switch (I.getOpcode()) { 1899 case TargetOpcode::G_AND: 1900 case TargetOpcode::G_OR: 1901 case TargetOpcode::G_XOR: 1902 if (selectG_AND_OR_XOR(I)) 1903 return true; 1904 return selectImpl(I, *CoverageInfo); 1905 case TargetOpcode::G_ADD: 1906 case TargetOpcode::G_SUB: 1907 if (selectImpl(I, *CoverageInfo)) 1908 return true; 1909 return selectG_ADD_SUB(I); 1910 case TargetOpcode::G_UADDO: 1911 case TargetOpcode::G_USUBO: 1912 case TargetOpcode::G_UADDE: 1913 case TargetOpcode::G_USUBE: 1914 return selectG_UADDO_USUBO_UADDE_USUBE(I); 1915 case TargetOpcode::G_INTTOPTR: 1916 case TargetOpcode::G_BITCAST: 1917 case TargetOpcode::G_PTRTOINT: 1918 return selectCOPY(I); 1919 case TargetOpcode::G_CONSTANT: 1920 case TargetOpcode::G_FCONSTANT: 1921 return selectG_CONSTANT(I); 1922 case TargetOpcode::G_FNEG: 1923 if (selectImpl(I, *CoverageInfo)) 1924 return true; 1925 return selectG_FNEG(I); 1926 case TargetOpcode::G_EXTRACT: 1927 return selectG_EXTRACT(I); 1928 case TargetOpcode::G_MERGE_VALUES: 1929 case TargetOpcode::G_BUILD_VECTOR: 1930 case TargetOpcode::G_CONCAT_VECTORS: 1931 return selectG_MERGE_VALUES(I); 1932 case TargetOpcode::G_UNMERGE_VALUES: 1933 return selectG_UNMERGE_VALUES(I); 1934 case TargetOpcode::G_PTR_ADD: 1935 return selectG_PTR_ADD(I); 1936 case TargetOpcode::G_IMPLICIT_DEF: 1937 return selectG_IMPLICIT_DEF(I); 1938 case TargetOpcode::G_INSERT: 1939 return selectG_INSERT(I); 1940 case TargetOpcode::G_INTRINSIC: 1941 return selectG_INTRINSIC(I); 1942 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 1943 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 1944 case TargetOpcode::G_ICMP: 1945 if (selectG_ICMP(I)) 1946 return true; 1947 return selectImpl(I, 
*CoverageInfo); 1948 case TargetOpcode::G_LOAD: 1949 case TargetOpcode::G_ATOMIC_CMPXCHG: 1950 case TargetOpcode::G_ATOMICRMW_XCHG: 1951 case TargetOpcode::G_ATOMICRMW_ADD: 1952 case TargetOpcode::G_ATOMICRMW_SUB: 1953 case TargetOpcode::G_ATOMICRMW_AND: 1954 case TargetOpcode::G_ATOMICRMW_OR: 1955 case TargetOpcode::G_ATOMICRMW_XOR: 1956 case TargetOpcode::G_ATOMICRMW_MIN: 1957 case TargetOpcode::G_ATOMICRMW_MAX: 1958 case TargetOpcode::G_ATOMICRMW_UMIN: 1959 case TargetOpcode::G_ATOMICRMW_UMAX: 1960 case TargetOpcode::G_ATOMICRMW_FADD: 1961 return selectG_LOAD_ATOMICRMW(I); 1962 case TargetOpcode::G_SELECT: 1963 return selectG_SELECT(I); 1964 case TargetOpcode::G_STORE: 1965 return selectG_STORE(I); 1966 case TargetOpcode::G_TRUNC: 1967 return selectG_TRUNC(I); 1968 case TargetOpcode::G_SEXT: 1969 case TargetOpcode::G_ZEXT: 1970 case TargetOpcode::G_ANYEXT: 1971 case TargetOpcode::G_SEXT_INREG: 1972 if (selectImpl(I, *CoverageInfo)) 1973 return true; 1974 return selectG_SZA_EXT(I); 1975 case TargetOpcode::G_BRCOND: 1976 return selectG_BRCOND(I); 1977 case TargetOpcode::G_FRAME_INDEX: 1978 case TargetOpcode::G_GLOBAL_VALUE: 1979 return selectG_FRAME_INDEX_GLOBAL_VALUE(I); 1980 case TargetOpcode::G_PTR_MASK: 1981 return selectG_PTR_MASK(I); 1982 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1983 return selectG_EXTRACT_VECTOR_ELT(I); 1984 case TargetOpcode::G_INSERT_VECTOR_ELT: 1985 return selectG_INSERT_VECTOR_ELT(I); 1986 case AMDGPU::G_AMDGPU_ATOMIC_INC: 1987 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 1988 initM0(I); 1989 return selectImpl(I, *CoverageInfo); 1990 default: 1991 return selectImpl(I, *CoverageInfo); 1992 } 1993 return false; 1994 } 1995 1996 InstructionSelector::ComplexRendererFns 1997 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 1998 return {{ 1999 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2000 }}; 2001 2002 } 2003 2004 std::pair<Register, unsigned> 2005 AMDGPUInstructionSelector::selectVOP3ModsImpl( 2006 Register Src) const { 2007 unsigned Mods = 0; 2008 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 2009 2010 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 2011 Src = MI->getOperand(1).getReg(); 2012 Mods |= SISrcMods::NEG; 2013 MI = getDefIgnoringCopies(Src, *MRI); 2014 } 2015 2016 if (MI && MI->getOpcode() == AMDGPU::G_FABS) { 2017 Src = MI->getOperand(1).getReg(); 2018 Mods |= SISrcMods::ABS; 2019 } 2020 2021 return std::make_pair(Src, Mods); 2022 } 2023 2024 /// 2025 /// This will select either an SGPR or VGPR operand and will save us from 2026 /// having to write an extra tablegen pattern. 
2027 InstructionSelector::ComplexRendererFns 2028 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 2029 return {{ 2030 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2031 }}; 2032 } 2033 2034 InstructionSelector::ComplexRendererFns 2035 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 2036 Register Src; 2037 unsigned Mods; 2038 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 2039 2040 return {{ 2041 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2042 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 2043 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2044 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2045 }}; 2046 } 2047 2048 InstructionSelector::ComplexRendererFns 2049 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 2050 return {{ 2051 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2052 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2053 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2054 }}; 2055 } 2056 2057 InstructionSelector::ComplexRendererFns 2058 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 2059 Register Src; 2060 unsigned Mods; 2061 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 2062 2063 return {{ 2064 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2065 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2066 }}; 2067 } 2068 2069 InstructionSelector::ComplexRendererFns 2070 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 2071 Register Reg = Root.getReg(); 2072 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 2073 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 2074 Def->getOpcode() == AMDGPU::G_FABS)) 2075 return {}; 2076 return {{ 2077 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2078 }}; 2079 } 2080 2081 InstructionSelector::ComplexRendererFns 2082 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 2083 Register Src; 2084 unsigned Mods; 2085 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 2086 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) 2087 return None; 2088 2089 return {{ 2090 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2091 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2092 }}; 2093 } 2094 2095 InstructionSelector::ComplexRendererFns 2096 AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const { 2097 // FIXME: Handle clamp and op_sel 2098 return {{ 2099 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 2100 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods 2101 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp 2102 }}; 2103 } 2104 2105 InstructionSelector::ComplexRendererFns 2106 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 2107 // FIXME: Handle op_sel 2108 return {{ 2109 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 2110 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 2111 }}; 2112 } 2113 2114 InstructionSelector::ComplexRendererFns 2115 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 2116 SmallVector<GEPInfo, 4> AddrInfo; 2117 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 2118 2119 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 2120 return None; 2121 2122 const GEPInfo &GEPInfo = AddrInfo[0]; 2123 Optional<int64_t> EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); 2124 if (!EncodedImm) 2125 
return None; 2126 2127 unsigned PtrReg = GEPInfo.SgprParts[0]; 2128 return {{ 2129 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 2130 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 2131 }}; 2132 } 2133 2134 InstructionSelector::ComplexRendererFns 2135 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 2136 SmallVector<GEPInfo, 4> AddrInfo; 2137 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 2138 2139 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 2140 return None; 2141 2142 const GEPInfo &GEPInfo = AddrInfo[0]; 2143 unsigned PtrReg = GEPInfo.SgprParts[0]; 2144 Optional<int64_t> EncodedImm = 2145 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 2146 if (!EncodedImm) 2147 return None; 2148 2149 return {{ 2150 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 2151 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 2152 }}; 2153 } 2154 2155 InstructionSelector::ComplexRendererFns 2156 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 2157 MachineInstr *MI = Root.getParent(); 2158 MachineBasicBlock *MBB = MI->getParent(); 2159 2160 SmallVector<GEPInfo, 4> AddrInfo; 2161 getAddrModeInfo(*MI, *MRI, AddrInfo); 2162 2163 // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits; 2164 // then we can select all ptr + 32-bit offsets, not just immediate offsets. 2165 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 2166 return None; 2167 2168 const GEPInfo &GEPInfo = AddrInfo[0]; 2169 if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm)) 2170 return None; 2171 2172 // If we make it this far we have a load with a 32-bit immediate offset. 2173 // It is OK to select this using an SGPR offset, because we have already 2174 // failed trying to select this load into one of the _IMM variants since 2175 // the _IMM patterns are considered before the _SGPR patterns.
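// The net effect is roughly the following (a sketch with made-up registers),
// letting the _SGPR form carry the offset in a scalar register:
//   s_mov_b32 s_off, <imm>
//   s_load_dword s_dst, s[base:base+1], s_off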
2176 unsigned PtrReg = GEPInfo.SgprParts[0]; 2177 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2178 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) 2179 .addImm(GEPInfo.Imm); 2180 return {{ 2181 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 2182 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } 2183 }}; 2184 } 2185 2186 template <bool Signed> 2187 InstructionSelector::ComplexRendererFns 2188 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { 2189 MachineInstr *MI = Root.getParent(); 2190 2191 InstructionSelector::ComplexRendererFns Default = {{ 2192 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 2193 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset 2194 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 2195 }}; 2196 2197 if (!STI.hasFlatInstOffsets()) 2198 return Default; 2199 2200 const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); 2201 if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD) 2202 return Default; 2203 2204 Optional<int64_t> Offset = 2205 getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); 2206 if (!Offset.hasValue()) 2207 return Default; 2208 2209 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 2210 if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed)) 2211 return Default; 2212 2213 Register BasePtr = OpDef->getOperand(1).getReg(); 2214 2215 return {{ 2216 [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, 2217 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, 2218 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 2219 }}; 2220 } 2221 2222 InstructionSelector::ComplexRendererFns 2223 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 2224 return selectFlatOffsetImpl<false>(Root); 2225 } 2226 2227 InstructionSelector::ComplexRendererFns 2228 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { 2229 return selectFlatOffsetImpl<true>(Root); 2230 } 2231 2232 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 2233 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 2234 return PSV && PSV->isStack(); 2235 } 2236 2237 InstructionSelector::ComplexRendererFns 2238 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 2239 MachineInstr *MI = Root.getParent(); 2240 MachineBasicBlock *MBB = MI->getParent(); 2241 MachineFunction *MF = MBB->getParent(); 2242 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 2243 2244 int64_t Offset = 0; 2245 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) { 2246 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2247 2248 // TODO: Should this be inside the render function? The iterator seems to 2249 // move. 2250 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 2251 HighBits) 2252 .addImm(Offset & ~4095); 2253 2254 return {{[=](MachineInstrBuilder &MIB) { // rsrc 2255 MIB.addReg(Info->getScratchRSrcReg()); 2256 }, 2257 [=](MachineInstrBuilder &MIB) { // vaddr 2258 MIB.addReg(HighBits); 2259 }, 2260 [=](MachineInstrBuilder &MIB) { // soffset 2261 const MachineMemOperand *MMO = *MI->memoperands_begin(); 2262 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 2263 2264 Register SOffsetReg = isStackPtrRelative(PtrInfo) 2265 ? 
Info->getStackPtrOffsetReg() 2266 : Info->getScratchWaveOffsetReg(); 2267 MIB.addReg(SOffsetReg); 2268 }, 2269 [=](MachineInstrBuilder &MIB) { // offset 2270 MIB.addImm(Offset & 4095); 2271 }}}; 2272 } 2273 2274 assert(Offset == 0); 2275 2276 // Try to fold a frame index directly into the MUBUF vaddr field, and any 2277 // offsets. 2278 Optional<int> FI; 2279 Register VAddr = Root.getReg(); 2280 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) { 2281 if (isBaseWithConstantOffset(Root, *MRI)) { 2282 const MachineOperand &LHS = RootDef->getOperand(1); 2283 const MachineOperand &RHS = RootDef->getOperand(2); 2284 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); 2285 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); 2286 if (LHSDef && RHSDef) { 2287 int64_t PossibleOffset = 2288 RHSDef->getOperand(1).getCImm()->getSExtValue(); 2289 if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && 2290 (!STI.privateMemoryResourceIsRangeChecked() || 2291 KnownBits->signBitIsZero(LHS.getReg()))) { 2292 if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) 2293 FI = LHSDef->getOperand(1).getIndex(); 2294 else 2295 VAddr = LHS.getReg(); 2296 Offset = PossibleOffset; 2297 } 2298 } 2299 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { 2300 FI = RootDef->getOperand(1).getIndex(); 2301 } 2302 } 2303 2304 // If we don't know this private access is a local stack object, it needs to 2305 // be relative to the entry point's scratch wave offset register. 2306 // TODO: Should split large offsets that don't fit, like above. 2307 // TODO: Don't use scratch wave offset just because the offset didn't fit. 2308 Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg() 2309 : Info->getScratchWaveOffsetReg(); 2310 2311 return {{[=](MachineInstrBuilder &MIB) { // rsrc 2312 MIB.addReg(Info->getScratchRSrcReg()); 2313 }, 2314 [=](MachineInstrBuilder &MIB) { // vaddr 2315 if (FI.hasValue()) 2316 MIB.addFrameIndex(FI.getValue()); 2317 else 2318 MIB.addReg(VAddr); 2319 }, 2320 [=](MachineInstrBuilder &MIB) { // soffset 2321 MIB.addReg(SOffset); 2322 }, 2323 [=](MachineInstrBuilder &MIB) { // offset 2324 MIB.addImm(Offset); 2325 }}}; 2326 } 2327 2328 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, 2329 int64_t Offset, 2330 unsigned OffsetBits) const { 2331 if ((OffsetBits == 16 && !isUInt<16>(Offset)) || 2332 (OffsetBits == 8 && !isUInt<8>(Offset))) 2333 return false; 2334 2335 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 2336 return true; 2337 2338 // On Southern Islands, instructions with a negative base value and an offset 2339 // don't seem to work. 2340 return KnownBits->signBitIsZero(Base); 2341 } 2342 2343 InstructionSelector::ComplexRendererFns 2344 AMDGPUInstructionSelector::selectMUBUFScratchOffset( 2345 MachineOperand &Root) const { 2346 MachineInstr *MI = Root.getParent(); 2347 MachineBasicBlock *MBB = MI->getParent(); 2348 2349 int64_t Offset = 0; 2350 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || 2351 !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) 2352 return {}; 2353 2354 const MachineFunction *MF = MBB->getParent(); 2355 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 2356 const MachineMemOperand *MMO = *MI->memoperands_begin(); 2357 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 2358 2359 Register SOffsetReg = isStackPtrRelative(PtrInfo) 2360 ?
Info->getStackPtrOffsetReg() 2361 : Info->getScratchWaveOffsetReg(); 2362 return {{ 2363 [=](MachineInstrBuilder &MIB) { 2364 MIB.addReg(Info->getScratchRSrcReg()); 2365 }, // rsrc 2366 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset 2367 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 2368 }}; 2369 } 2370 2371 std::pair<Register, unsigned> 2372 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 2373 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 2374 if (!RootDef) 2375 return std::make_pair(Root.getReg(), 0); 2376 2377 int64_t ConstAddr = 0; 2378 if (isBaseWithConstantOffset(Root, *MRI)) { 2379 const MachineOperand &LHS = RootDef->getOperand(1); 2380 const MachineOperand &RHS = RootDef->getOperand(2); 2381 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); 2382 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); 2383 if (LHSDef && RHSDef) { 2384 int64_t PossibleOffset = 2385 RHSDef->getOperand(1).getCImm()->getSExtValue(); 2386 if (isDSOffsetLegal(LHS.getReg(), PossibleOffset, 16)) { 2387 // (add n0, c0) 2388 return std::make_pair(LHS.getReg(), PossibleOffset); 2389 } 2390 } 2391 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 2392 // TODO 2393 2394 2395 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 2396 // TODO 2397 2398 } 2399 2400 return std::make_pair(Root.getReg(), 0); 2401 } 2402 2403 InstructionSelector::ComplexRendererFns 2404 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { 2405 2406 Register Reg; 2407 unsigned Offset; 2408 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); 2409 return {{ 2410 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2411 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } 2412 }}; 2413 } 2414 2415 InstructionSelector::ComplexRendererFns 2416 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { 2417 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 2418 if (!RootDef) { 2419 return {{ 2420 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2421 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 2422 [=](MachineInstrBuilder &MIB) { MIB.addImm(1); } 2423 }}; 2424 } 2425 2426 int64_t ConstAddr = 0; 2427 Register PtrBase; 2428 int64_t Offset; 2429 2430 std::tie(PtrBase, Offset) = 2431 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 2432 2433 if (Offset) { 2434 int64_t DWordOffset0 = Offset / 4; 2435 int64_t DWordOffset1 = DWordOffset0 + 1; 2436 if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) { 2437 // (add n0, c0) 2438 return {{ 2439 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, 2440 [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset0); }, 2441 [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset1); } 2442 }}; 2443 } 2444 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 2445 // TODO 2446 2447 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 2448 // TODO 2449 2450 } 2451 2452 return {{ 2453 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2454 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 2455 [=](MachineInstrBuilder &MIB) { MIB.addImm(1); } 2456 }}; 2457 } 2458 2459 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return 2460 /// the base value with the constant offset. There may be intervening copies 2461 /// between \p Root and the identified constant. Returns \p Root, 0 if this does 2462 /// not match the pattern. 
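/// For example (a sketch), %p = G_PTR_ADD %base, %c where %c is a COPY of a
/// G_CONSTANT i64 16 yields {%base, 16}.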
2463 std::pair<Register, int64_t> 2464 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( 2465 Register Root, const MachineRegisterInfo &MRI) const { 2466 MachineInstr *RootI = MRI.getVRegDef(Root); 2467 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) 2468 return {Root, 0}; 2469 2470 MachineOperand &RHS = RootI->getOperand(2); 2471 Optional<ValueAndVReg> MaybeOffset 2472 = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true); 2473 if (!MaybeOffset) 2474 return {Root, 0}; 2475 return {RootI->getOperand(1).getReg(), MaybeOffset->Value}; 2476 } 2477 2478 static void addZeroImm(MachineInstrBuilder &MIB) { 2479 MIB.addImm(0); 2480 } 2481 2482 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p 2483 /// BasePtr is not valid, a null base pointer will be used. 2484 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, 2485 uint32_t FormatLo, uint32_t FormatHi, 2486 Register BasePtr) { 2487 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 2488 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 2489 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2490 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 2491 2492 B.buildInstr(AMDGPU::S_MOV_B32) 2493 .addDef(RSrc2) 2494 .addImm(FormatLo); 2495 B.buildInstr(AMDGPU::S_MOV_B32) 2496 .addDef(RSrc3) 2497 .addImm(FormatHi); 2498 2499 // Build the half of the subregister with the constants before building the 2500 // full 128-bit register. If we are building multiple resource descriptors, 2501 // this will allow CSEing of the 2-component register. 2502 B.buildInstr(AMDGPU::REG_SEQUENCE) 2503 .addDef(RSrcHi) 2504 .addReg(RSrc2) 2505 .addImm(AMDGPU::sub0) 2506 .addReg(RSrc3) 2507 .addImm(AMDGPU::sub1); 2508 2509 Register RSrcLo = BasePtr; 2510 if (!BasePtr) { 2511 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2512 B.buildInstr(AMDGPU::S_MOV_B64) 2513 .addDef(RSrcLo) 2514 .addImm(0); 2515 } 2516 2517 B.buildInstr(AMDGPU::REG_SEQUENCE) 2518 .addDef(RSrc) 2519 .addReg(RSrcLo) 2520 .addImm(AMDGPU::sub0_sub1) 2521 .addReg(RSrcHi) 2522 .addImm(AMDGPU::sub2_sub3); 2523 2524 return RSrc; 2525 } 2526 2527 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 2528 const SIInstrInfo &TII, Register BasePtr) { 2529 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 2530 2531 // FIXME: Why are half the "default" bits ignored based on the addressing 2532 // mode? 2533 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr); 2534 } 2535 2536 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 2537 const SIInstrInfo &TII, Register BasePtr) { 2538 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 2539 2540 // FIXME: Why are half the "default" bits ignored based on the addressing 2541 // mode? 
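// Passing -1 for the low constant word sets the descriptor dword that
// buildRSRC places in sub2 to all ones, which with the usual buffer
// descriptor layout corresponds to num_records = 0xffffffff, i.e. an
// effectively unbounded buffer.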
2542 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr); 2543 } 2544 2545 AMDGPUInstructionSelector::MUBUFAddressData 2546 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { 2547 MUBUFAddressData Data; 2548 Data.N0 = Src; 2549 2550 Register PtrBase; 2551 int64_t Offset; 2552 2553 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); 2554 if (isUInt<32>(Offset)) { 2555 Data.N0 = PtrBase; 2556 Data.Offset = Offset; 2557 } 2558 2559 if (MachineInstr *InputAdd 2560 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) { 2561 Data.N2 = InputAdd->getOperand(1).getReg(); 2562 Data.N3 = InputAdd->getOperand(2).getReg(); 2563 2564 // FIXME: Need to fix extra SGPR->VGPR copies inserted 2565 // FIXME: We don't know that this was defined by operand 0 2566 // 2567 // TODO: Remove this when we have copy folding optimizations after 2568 // RegBankSelect. 2569 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg(); 2570 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg(); 2571 } 2572 2573 return Data; 2574 } 2575 2576 /// Return true if the addr64 MUBUF mode should be used for the given address. 2577 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const { 2578 // (ptr_add N2, N3) -> addr64, or 2579 // (ptr_add (ptr_add N2, N3), C1) -> addr64 2580 if (Addr.N2) 2581 return true; 2582 2583 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI); 2584 return N0Bank->getID() == AMDGPU::VGPRRegBankID; 2585 } 2586 2587 /// Split an immediate offset \p ImmOffset depending on whether it fits in the 2588 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable 2589 /// component. 2590 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset( 2591 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const { 2592 if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset)) 2593 return; 2594 2595 // Illegal offset, store it in soffset. 2596 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2597 B.buildInstr(AMDGPU::S_MOV_B32) 2598 .addDef(SOffset) 2599 .addImm(ImmOffset); 2600 ImmOffset = 0; 2601 } 2602 2603 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl( 2604 MachineOperand &Root, Register &VAddr, Register &RSrcReg, 2605 Register &SOffset, int64_t &Offset) const { 2606 // FIXME: Predicates should stop this from reaching here. 2607 // The addr64 bit was removed for Volcanic Islands. 2608 if (!STI.hasAddr64() || STI.useFlatForGlobal()) 2609 return false; 2610 2611 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 2612 if (!shouldUseAddr64(AddrData)) 2613 return false; 2614 2615 Register N0 = AddrData.N0; 2616 Register N2 = AddrData.N2; 2617 Register N3 = AddrData.N3; 2618 Offset = AddrData.Offset; 2619 2620 // Base pointer for the SRD. 2621 Register SRDPtr; 2622 2623 if (N2) { 2624 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 2625 assert(N3); 2626 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 2627 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the 2628 // addr64, and construct the default resource from a 0 address. 2629 VAddr = N0; 2630 } else { 2631 SRDPtr = N3; 2632 VAddr = N2; 2633 } 2634 } else { 2635 // N2 is not divergent.
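// Use it as the 64-bit base pointer of the SRD, and route N3 through vaddr.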
2636 SRDPtr = N2; 2637 VAddr = N3; 2638 } 2639 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 2640 // Use the default null pointer in the resource 2641 VAddr = N0; 2642 } else { 2643 // N0 -> offset, or 2644 // (N0 + C1) -> offset 2645 SRDPtr = N0; 2646 } 2647 2648 MachineIRBuilder B(*Root.getParent()); 2649 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr); 2650 splitIllegalMUBUFOffset(B, SOffset, Offset); 2651 return true; 2652 } 2653 2654 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( 2655 MachineOperand &Root, Register &RSrcReg, Register &SOffset, 2656 int64_t &Offset) const { 2657 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 2658 if (shouldUseAddr64(AddrData)) 2659 return false; 2660 2661 // N0 -> offset, or 2662 // (N0 + C1) -> offset 2663 Register SRDPtr = AddrData.N0; 2664 Offset = AddrData.Offset; 2665 2666 // TODO: Look through extensions for 32-bit soffset. 2667 MachineIRBuilder B(*Root.getParent()); 2668 2669 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); 2670 splitIllegalMUBUFOffset(B, SOffset, Offset); 2671 return true; 2672 } 2673 2674 InstructionSelector::ComplexRendererFns 2675 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { 2676 Register VAddr; 2677 Register RSrcReg; 2678 Register SOffset; 2679 int64_t Offset = 0; 2680 2681 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 2682 return {}; 2683 2684 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 2685 // pattern. 2686 return {{ 2687 [=](MachineInstrBuilder &MIB) { // rsrc 2688 MIB.addReg(RSrcReg); 2689 }, 2690 [=](MachineInstrBuilder &MIB) { // vaddr 2691 MIB.addReg(VAddr); 2692 }, 2693 [=](MachineInstrBuilder &MIB) { // soffset 2694 if (SOffset) 2695 MIB.addReg(SOffset); 2696 else 2697 MIB.addImm(0); 2698 }, 2699 [=](MachineInstrBuilder &MIB) { // offset 2700 MIB.addImm(Offset); 2701 }, 2702 addZeroImm, // glc 2703 addZeroImm, // slc 2704 addZeroImm, // tfe 2705 addZeroImm, // dlc 2706 addZeroImm // swz 2707 }}; 2708 } 2709 2710 InstructionSelector::ComplexRendererFns 2711 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { 2712 Register RSrcReg; 2713 Register SOffset; 2714 int64_t Offset = 0; 2715 2716 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 2717 return {}; 2718 2719 return {{ 2720 [=](MachineInstrBuilder &MIB) { // rsrc 2721 MIB.addReg(RSrcReg); 2722 }, 2723 [=](MachineInstrBuilder &MIB) { // soffset 2724 if (SOffset) 2725 MIB.addReg(SOffset); 2726 else 2727 MIB.addImm(0); 2728 }, 2729 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 2730 addZeroImm, // glc 2731 addZeroImm, // slc 2732 addZeroImm, // tfe 2733 addZeroImm, // dlc 2734 addZeroImm // swz 2735 }}; 2736 } 2737 2738 InstructionSelector::ComplexRendererFns 2739 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const { 2740 Register VAddr; 2741 Register RSrcReg; 2742 Register SOffset; 2743 int64_t Offset = 0; 2744 2745 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 2746 return {}; 2747 2748 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 2749 // pattern. 
2750 return {{ 2751 [=](MachineInstrBuilder &MIB) { // rsrc 2752 MIB.addReg(RSrcReg); 2753 }, 2754 [=](MachineInstrBuilder &MIB) { // vaddr 2755 MIB.addReg(VAddr); 2756 }, 2757 [=](MachineInstrBuilder &MIB) { // soffset 2758 if (SOffset) 2759 MIB.addReg(SOffset); 2760 else 2761 MIB.addImm(0); 2762 }, 2763 [=](MachineInstrBuilder &MIB) { // offset 2764 MIB.addImm(Offset); 2765 }, 2766 addZeroImm // slc 2767 }}; 2768 } 2769 2770 InstructionSelector::ComplexRendererFns 2771 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const { 2772 Register RSrcReg; 2773 Register SOffset; 2774 int64_t Offset = 0; 2775 2776 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 2777 return {}; 2778 2779 return {{ 2780 [=](MachineInstrBuilder &MIB) { // rsrc 2781 MIB.addReg(RSrcReg); 2782 }, 2783 [=](MachineInstrBuilder &MIB) { // soffset 2784 if (SOffset) 2785 MIB.addReg(SOffset); 2786 else 2787 MIB.addImm(0); 2788 }, 2789 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 2790 addZeroImm // slc 2791 }}; 2792 } 2793 2794 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, 2795 const MachineInstr &MI, 2796 int OpIdx) const { 2797 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 2798 "Expected G_CONSTANT"); 2799 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue()); 2800 } 2801 2802 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, 2803 const MachineInstr &MI, 2804 int OpIdx) const { 2805 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 2806 "Expected G_CONSTANT"); 2807 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue()); 2808 } 2809 2810 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB, 2811 const MachineInstr &MI, 2812 int OpIdx) const { 2813 assert(OpIdx == -1); 2814 2815 const MachineOperand &Op = MI.getOperand(1); 2816 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) 2817 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 2818 else { 2819 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); 2820 MIB.addImm(Op.getCImm()->getSExtValue()); 2821 } 2822 } 2823 2824 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, 2825 const MachineInstr &MI, 2826 int OpIdx) const { 2827 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 2828 "Expected G_CONSTANT"); 2829 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); 2830 } 2831 2832 /// This only really exists to satisfy DAG type checking machinery, so is a 2833 /// no-op here. 
2834 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, 2835 const MachineInstr &MI, 2836 int OpIdx) const { 2837 MIB.addImm(MI.getOperand(OpIdx).getImm()); 2838 } 2839 2840 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB, 2841 const MachineInstr &MI, 2842 int OpIdx) const { 2843 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2844 MIB.addImm(MI.getOperand(OpIdx).getImm() & 1); 2845 } 2846 2847 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB, 2848 const MachineInstr &MI, 2849 int OpIdx) const { 2850 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2851 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1); 2852 } 2853 2854 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB, 2855 const MachineInstr &MI, 2856 int OpIdx) const { 2857 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2858 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1); 2859 } 2860 2861 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, 2862 const MachineInstr &MI, 2863 int OpIdx) const { 2864 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2865 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); 2866 } 2867 2868 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { 2869 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); 2870 } 2871 2872 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const { 2873 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm()); 2874 } 2875 2876 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const { 2877 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm()); 2878 } 2879 2880 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { 2881 return TII.isInlineConstant(Imm); 2882 } 2883