1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUInstructionSelector.h" 15 #include "AMDGPUInstrInfo.h" 16 #include "AMDGPUGlobalISelUtils.h" 17 #include "AMDGPURegisterBankInfo.h" 18 #include "AMDGPURegisterInfo.h" 19 #include "AMDGPUSubtarget.h" 20 #include "AMDGPUTargetMachine.h" 21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 22 #include "SIMachineFunctionInfo.h" 23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 24 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 25 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 28 #include "llvm/CodeGen/GlobalISel/Utils.h" 29 #include "llvm/CodeGen/MachineBasicBlock.h" 30 #include "llvm/CodeGen/MachineFunction.h" 31 #include "llvm/CodeGen/MachineInstr.h" 32 #include "llvm/CodeGen/MachineInstrBuilder.h" 33 #include "llvm/CodeGen/MachineRegisterInfo.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/Support/Debug.h" 36 #include "llvm/Support/raw_ostream.h" 37 38 #define DEBUG_TYPE "amdgpu-isel" 39 40 using namespace llvm; 41 using namespace MIPatternMatch; 42 43 #define GET_GLOBALISEL_IMPL 44 #define AMDGPUSubtarget GCNSubtarget 45 #include "AMDGPUGenGlobalISel.inc" 46 #undef GET_GLOBALISEL_IMPL 47 #undef AMDGPUSubtarget 48 49 AMDGPUInstructionSelector::AMDGPUInstructionSelector( 50 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, 51 const AMDGPUTargetMachine &TM) 52 : InstructionSelector(), TII(*STI.getInstrInfo()), 53 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), 54 STI(STI), 55 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), 56 #define GET_GLOBALISEL_PREDICATES_INIT 57 #include "AMDGPUGenGlobalISel.inc" 58 #undef GET_GLOBALISEL_PREDICATES_INIT 59 #define GET_GLOBALISEL_TEMPORARIES_INIT 60 #include "AMDGPUGenGlobalISel.inc" 61 #undef GET_GLOBALISEL_TEMPORARIES_INIT 62 { 63 } 64 65 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } 66 67 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, 68 CodeGenCoverage &CoverageInfo) { 69 MRI = &MF.getRegInfo(); 70 InstructionSelector::setupMF(MF, KB, CoverageInfo); 71 } 72 73 bool AMDGPUInstructionSelector::isVCC(Register Reg, 74 const MachineRegisterInfo &MRI) const { 75 if (Register::isPhysicalRegister(Reg)) 76 return Reg == TRI.getVCC(); 77 78 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 79 const TargetRegisterClass *RC = 80 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 81 if (RC) { 82 const LLT Ty = MRI.getType(Reg); 83 return RC->hasSuperClassEq(TRI.getBoolRC()) && 84 Ty.isValid() && Ty.getSizeInBits() == 1; 85 } 86 87 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 88 return RB->getID() == AMDGPU::VCCRegBankID; 89 } 90 91 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, 92 unsigned NewOpc) const { 93 MI.setDesc(TII.get(NewOpc)); 94 
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to know
    // specifically that the result is VCCRegBank, and not just an SGPR
    // with size 1. An SReg_32 with size 1 is ambiguous with wave32.
174 if (Src.isUndef()) { 175 const TargetRegisterClass *SrcRC = 176 TRI.getConstrainedRegClassForOperand(Src, *MRI); 177 if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 178 return false; 179 } 180 181 return true; 182 } 183 184 for (const MachineOperand &MO : I.operands()) { 185 if (Register::isPhysicalRegister(MO.getReg())) 186 continue; 187 188 const TargetRegisterClass *RC = 189 TRI.getConstrainedRegClassForOperand(MO, *MRI); 190 if (!RC) 191 continue; 192 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 193 } 194 return true; 195 } 196 197 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { 198 const Register DefReg = I.getOperand(0).getReg(); 199 const LLT DefTy = MRI->getType(DefReg); 200 201 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy) 202 203 const RegClassOrRegBank &RegClassOrBank = 204 MRI->getRegClassOrRegBank(DefReg); 205 206 const TargetRegisterClass *DefRC 207 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 208 if (!DefRC) { 209 if (!DefTy.isValid()) { 210 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 211 return false; 212 } 213 214 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 215 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); 216 if (!DefRC) { 217 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 218 return false; 219 } 220 } 221 222 // TODO: Verify that all registers have the same bank 223 I.setDesc(TII.get(TargetOpcode::PHI)); 224 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); 225 } 226 227 MachineOperand 228 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, 229 const TargetRegisterClass &SubRC, 230 unsigned SubIdx) const { 231 232 MachineInstr *MI = MO.getParent(); 233 MachineBasicBlock *BB = MO.getParent()->getParent(); 234 Register DstReg = MRI->createVirtualRegister(&SubRC); 235 236 if (MO.isReg()) { 237 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); 238 Register Reg = MO.getReg(); 239 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 240 .addReg(Reg, 0, ComposedSubIdx); 241 242 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), 243 MO.isKill(), MO.isDead(), MO.isUndef(), 244 MO.isEarlyClobber(), 0, MO.isDebug(), 245 MO.isInternalRead()); 246 } 247 248 assert(MO.isImm()); 249 250 APInt Imm(64, MO.getImm()); 251 252 switch (SubIdx) { 253 default: 254 llvm_unreachable("do not know to split immediate with this sub index."); 255 case AMDGPU::sub0: 256 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); 257 case AMDGPU::sub1: 258 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); 259 } 260 } 261 262 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { 263 switch (Opc) { 264 case AMDGPU::G_AND: 265 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 266 case AMDGPU::G_OR: 267 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; 268 case AMDGPU::G_XOR: 269 return Is64 ? 
AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 270 default: 271 llvm_unreachable("not a bit op"); 272 } 273 } 274 275 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 276 MachineOperand &Dst = I.getOperand(0); 277 MachineOperand &Src0 = I.getOperand(1); 278 MachineOperand &Src1 = I.getOperand(2); 279 Register DstReg = Dst.getReg(); 280 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 281 282 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 283 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 284 const TargetRegisterClass *RC = TRI.getBoolRC(); 285 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), 286 RC == &AMDGPU::SReg_64RegClass); 287 I.setDesc(TII.get(InstOpc)); 288 289 // FIXME: Hack to avoid turning the register bank into a register class. 290 // The selector for G_ICMP relies on seeing the register bank for the result 291 // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will 292 // be ambiguous whether it's a scalar or vector bool. 293 if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) 294 MRI->setRegClass(Src0.getReg(), RC); 295 if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) 296 MRI->setRegClass(Src1.getReg(), RC); 297 298 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 299 } 300 301 // TODO: Should this allow an SCC bank result, and produce a copy from SCC for 302 // the result? 303 if (DstRB->getID() == AMDGPU::SGPRRegBankID) { 304 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); 305 I.setDesc(TII.get(InstOpc)); 306 // Dead implicit-def of scc 307 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 308 true, // isImp 309 false, // isKill 310 true)); // isDead 311 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 312 } 313 314 return false; 315 } 316 317 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 318 MachineBasicBlock *BB = I.getParent(); 319 MachineFunction *MF = BB->getParent(); 320 Register DstReg = I.getOperand(0).getReg(); 321 const DebugLoc &DL = I.getDebugLoc(); 322 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 323 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 324 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 325 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 326 327 if (Size == 32) { 328 if (IsSALU) { 329 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 330 MachineInstr *Add = 331 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 332 .add(I.getOperand(1)) 333 .add(I.getOperand(2)); 334 I.eraseFromParent(); 335 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 336 } 337 338 if (STI.hasAddNoCarry()) { 339 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 340 I.setDesc(TII.get(Opc)); 341 I.addOperand(*MF, MachineOperand::CreateImm(0)); 342 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 343 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 344 } 345 346 const unsigned Opc = Sub ? 
      AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .addDef(UnusedCarry, RegState::Dead)
        .add(I.getOperand(1))
        .add(I.getOperand(2))
        .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);


  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have
    // unsigned carry out despite the _i32 name. These were renamed in VI to
    // _U32.
    // FIXME: We should probably rename the opcodes here.
    unsigned NoCarryOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ?
CarryOpc : NoCarryOpc)); 433 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 434 I.addOperand(*MF, MachineOperand::CreateImm(0)); 435 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 436 } 437 438 Register Src0Reg = I.getOperand(2).getReg(); 439 Register Src1Reg = I.getOperand(3).getReg(); 440 441 if (HasCarryIn) { 442 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 443 .addReg(I.getOperand(4).getReg()); 444 } 445 446 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 447 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 448 449 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg) 450 .add(I.getOperand(2)) 451 .add(I.getOperand(3)); 452 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 453 .addReg(AMDGPU::SCC); 454 455 if (!MRI->getRegClassOrNull(Dst1Reg)) 456 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 457 458 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 459 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 460 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 461 return false; 462 463 if (HasCarryIn && 464 !RBI.constrainGenericRegister(I.getOperand(4).getReg(), 465 AMDGPU::SReg_32RegClass, *MRI)) 466 return false; 467 468 I.eraseFromParent(); 469 return true; 470 } 471 472 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 473 MachineBasicBlock *BB = I.getParent(); 474 Register DstReg = I.getOperand(0).getReg(); 475 Register SrcReg = I.getOperand(1).getReg(); 476 LLT DstTy = MRI->getType(DstReg); 477 LLT SrcTy = MRI->getType(SrcReg); 478 const unsigned SrcSize = SrcTy.getSizeInBits(); 479 const unsigned DstSize = DstTy.getSizeInBits(); 480 481 // TODO: Should handle any multiple of 32 offset. 
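  // The offset currently has to line up exactly with a DstSize-wide
  // subregister so the extract can be selected as a plain subregister copy.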
482 unsigned Offset = I.getOperand(2).getImm(); 483 if (Offset % DstSize != 0) 484 return false; 485 486 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 487 const TargetRegisterClass *SrcRC = 488 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 489 if (!SrcRC) 490 return false; 491 492 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 493 494 const DebugLoc &DL = I.getDebugLoc(); 495 MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) 496 .addReg(SrcReg, 0, SubRegs[Offset / DstSize]); 497 498 for (const MachineOperand &MO : Copy->operands()) { 499 const TargetRegisterClass *RC = 500 TRI.getConstrainedRegClassForOperand(MO, *MRI); 501 if (!RC) 502 continue; 503 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 504 } 505 I.eraseFromParent(); 506 return true; 507 } 508 509 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 510 MachineBasicBlock *BB = MI.getParent(); 511 Register DstReg = MI.getOperand(0).getReg(); 512 LLT DstTy = MRI->getType(DstReg); 513 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 514 515 const unsigned SrcSize = SrcTy.getSizeInBits(); 516 if (SrcSize < 32) 517 return selectImpl(MI, *CoverageInfo); 518 519 const DebugLoc &DL = MI.getDebugLoc(); 520 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 521 const unsigned DstSize = DstTy.getSizeInBits(); 522 const TargetRegisterClass *DstRC = 523 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 524 if (!DstRC) 525 return false; 526 527 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 528 MachineInstrBuilder MIB = 529 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 530 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 531 MachineOperand &Src = MI.getOperand(I + 1); 532 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 533 MIB.addImm(SubRegs[I]); 534 535 const TargetRegisterClass *SrcRC 536 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 537 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 538 return false; 539 } 540 541 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 542 return false; 543 544 MI.eraseFromParent(); 545 return true; 546 } 547 548 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 549 MachineBasicBlock *BB = MI.getParent(); 550 const int NumDst = MI.getNumOperands() - 1; 551 552 MachineOperand &Src = MI.getOperand(NumDst); 553 554 Register SrcReg = Src.getReg(); 555 Register DstReg0 = MI.getOperand(0).getReg(); 556 LLT DstTy = MRI->getType(DstReg0); 557 LLT SrcTy = MRI->getType(SrcReg); 558 559 const unsigned DstSize = DstTy.getSizeInBits(); 560 const unsigned SrcSize = SrcTy.getSizeInBits(); 561 const DebugLoc &DL = MI.getDebugLoc(); 562 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 563 564 const TargetRegisterClass *SrcRC = 565 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 566 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 567 return false; 568 569 const unsigned SrcFlags = getUndefRegState(Src.isUndef()); 570 571 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 572 // source, and this relies on the fact that the same subregister indices are 573 // used for both. 
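  // Split the source into DstSize-sized pieces (getRegSplitParts takes the
  // piece size in bytes) and copy each piece out through its subregister
  // index.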
574 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 575 for (int I = 0, E = NumDst; I != E; ++I) { 576 MachineOperand &Dst = MI.getOperand(I); 577 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 578 .addReg(SrcReg, SrcFlags, SubRegs[I]); 579 580 const TargetRegisterClass *DstRC = 581 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 582 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 583 return false; 584 } 585 586 MI.eraseFromParent(); 587 return true; 588 } 589 590 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 591 return selectG_ADD_SUB(I); 592 } 593 594 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 595 const MachineOperand &MO = I.getOperand(0); 596 597 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 598 // regbank check here is to know why getConstrainedRegClassForOperand failed. 599 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 600 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 601 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 602 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 603 return true; 604 } 605 606 return false; 607 } 608 609 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 610 MachineBasicBlock *BB = I.getParent(); 611 612 Register DstReg = I.getOperand(0).getReg(); 613 Register Src0Reg = I.getOperand(1).getReg(); 614 Register Src1Reg = I.getOperand(2).getReg(); 615 LLT Src1Ty = MRI->getType(Src1Reg); 616 617 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 618 unsigned InsSize = Src1Ty.getSizeInBits(); 619 620 int64_t Offset = I.getOperand(3).getImm(); 621 if (Offset % 32 != 0) 622 return false; 623 624 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 625 if (SubReg == AMDGPU::NoSubRegister) 626 return false; 627 628 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 629 const TargetRegisterClass *DstRC = 630 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 631 if (!DstRC) 632 return false; 633 634 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 635 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 636 const TargetRegisterClass *Src0RC = 637 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 638 const TargetRegisterClass *Src1RC = 639 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 640 641 // Deal with weird cases where the class only partially supports the subreg 642 // index. 
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
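    // SI_IF_BREAK combines the current loop condition with the mask of lanes
    // that have already broken out, producing the updated loop mask; all three
    // registers live in the wave mask register class.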
715 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK)) 716 .add(I.getOperand(0)) 717 .add(I.getOperand(2)) 718 .add(I.getOperand(3)); 719 720 Register DstReg = I.getOperand(0).getReg(); 721 Register Src0Reg = I.getOperand(2).getReg(); 722 Register Src1Reg = I.getOperand(3).getReg(); 723 724 I.eraseFromParent(); 725 726 for (Register Reg : { DstReg, Src0Reg, Src1Reg }) 727 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 728 729 return true; 730 } 731 case Intrinsic::amdgcn_interp_p1_f16: 732 return selectInterpP1F16(I); 733 case Intrinsic::amdgcn_wqm: 734 return constrainCopyLikeIntrin(I, AMDGPU::WQM); 735 case Intrinsic::amdgcn_softwqm: 736 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); 737 case Intrinsic::amdgcn_wwm: 738 return constrainCopyLikeIntrin(I, AMDGPU::WWM); 739 default: 740 return selectImpl(I, *CoverageInfo); 741 } 742 } 743 744 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) { 745 if (Size != 32 && Size != 64) 746 return -1; 747 switch (P) { 748 default: 749 llvm_unreachable("Unknown condition code!"); 750 case CmpInst::ICMP_NE: 751 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64; 752 case CmpInst::ICMP_EQ: 753 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64; 754 case CmpInst::ICMP_SGT: 755 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64; 756 case CmpInst::ICMP_SGE: 757 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64; 758 case CmpInst::ICMP_SLT: 759 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64; 760 case CmpInst::ICMP_SLE: 761 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64; 762 case CmpInst::ICMP_UGT: 763 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64; 764 case CmpInst::ICMP_UGE: 765 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64; 766 case CmpInst::ICMP_ULT: 767 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64; 768 case CmpInst::ICMP_ULE: 769 return Size == 32 ? 
      AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
      .add(I.getOperand(2))
      .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
      constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
      RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
                               I.getOperand(0).getReg())
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
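  // SI_END_CF takes the saved exec mask produced by the matching SI_IF/SI_ELSE
  // and re-enables those lanes when control flow rejoins.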
862 MachineBasicBlock *BB = MI.getParent(); 863 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) 864 .add(MI.getOperand(1)); 865 866 Register Reg = MI.getOperand(1).getReg(); 867 MI.eraseFromParent(); 868 869 if (!MRI->getRegClassOrNull(Reg)) 870 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 871 return true; 872 } 873 874 static unsigned getDSShaderTypeValue(const MachineFunction &MF) { 875 switch (MF.getFunction().getCallingConv()) { 876 case CallingConv::AMDGPU_PS: 877 return 1; 878 case CallingConv::AMDGPU_VS: 879 return 2; 880 case CallingConv::AMDGPU_GS: 881 return 3; 882 case CallingConv::AMDGPU_HS: 883 case CallingConv::AMDGPU_LS: 884 case CallingConv::AMDGPU_ES: 885 report_fatal_error("ds_ordered_count unsupported for this calling conv"); 886 case CallingConv::AMDGPU_CS: 887 case CallingConv::AMDGPU_KERNEL: 888 case CallingConv::C: 889 case CallingConv::Fast: 890 default: 891 // Assume other calling conventions are various compute callable functions 892 return 0; 893 } 894 } 895 896 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 897 MachineInstr &MI, Intrinsic::ID IntrID) const { 898 MachineBasicBlock *MBB = MI.getParent(); 899 MachineFunction *MF = MBB->getParent(); 900 const DebugLoc &DL = MI.getDebugLoc(); 901 902 unsigned IndexOperand = MI.getOperand(7).getImm(); 903 bool WaveRelease = MI.getOperand(8).getImm() != 0; 904 bool WaveDone = MI.getOperand(9).getImm() != 0; 905 906 if (WaveDone && !WaveRelease) 907 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 908 909 unsigned OrderedCountIndex = IndexOperand & 0x3f; 910 IndexOperand &= ~0x3f; 911 unsigned CountDw = 0; 912 913 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 914 CountDw = (IndexOperand >> 24) & 0xf; 915 IndexOperand &= ~(0xf << 24); 916 917 if (CountDw < 1 || CountDw > 4) { 918 report_fatal_error( 919 "ds_ordered_count: dword count must be between 1 and 4"); 920 } 921 } 922 923 if (IndexOperand) 924 report_fatal_error("ds_ordered_count: bad index operand"); 925 926 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; 927 unsigned ShaderType = getDSShaderTypeValue(*MF); 928 929 unsigned Offset0 = OrderedCountIndex << 2; 930 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | 931 (Instruction << 4); 932 933 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) 934 Offset1 |= (CountDw - 1) << 6; 935 936 unsigned Offset = Offset0 | (Offset1 << 8); 937 938 Register M0Val = MI.getOperand(2).getReg(); 939 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 940 .addReg(M0Val); 941 942 Register DstReg = MI.getOperand(0).getReg(); 943 Register ValReg = MI.getOperand(3).getReg(); 944 MachineInstrBuilder DS = 945 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) 946 .addReg(ValReg) 947 .addImm(Offset) 948 .cloneMemRefs(MI); 949 950 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) 951 return false; 952 953 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); 954 MI.eraseFromParent(); 955 return Ret; 956 } 957 958 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 959 switch (IntrID) { 960 case Intrinsic::amdgcn_ds_gws_init: 961 return AMDGPU::DS_GWS_INIT; 962 case Intrinsic::amdgcn_ds_gws_barrier: 963 return AMDGPU::DS_GWS_BARRIER; 964 case Intrinsic::amdgcn_ds_gws_sema_v: 965 return AMDGPU::DS_GWS_SEMA_V; 966 case Intrinsic::amdgcn_ds_gws_sema_br: 967 return AMDGPU::DS_GWS_SEMA_BR; 968 case Intrinsic::amdgcn_ds_gws_sema_p: 969 return AMDGPU::DS_GWS_SEMA_P; 970 case Intrinsic::amdgcn_ds_gws_sema_release_all: 971 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 972 default: 973 llvm_unreachable("not a gws intrinsic"); 974 } 975 } 976 977 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, 978 Intrinsic::ID IID) const { 979 if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && 980 !STI.hasGWSSemaReleaseAll()) 981 return false; 982 983 // intrinsic ID, vsrc, offset 984 const bool HasVSrc = MI.getNumOperands() == 3; 985 assert(HasVSrc || MI.getNumOperands() == 2); 986 987 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg(); 988 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); 989 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) 990 return false; 991 992 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 993 assert(OffsetDef); 994 995 unsigned ImmOffset; 996 997 MachineBasicBlock *MBB = MI.getParent(); 998 const DebugLoc &DL = MI.getDebugLoc(); 999 1000 MachineInstr *Readfirstlane = nullptr; 1001 1002 // If we legalized the VGPR input, strip out the readfirstlane to analyze the 1003 // incoming offset, in case there's an add of a constant. We'll have to put it 1004 // back later. 1005 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { 1006 Readfirstlane = OffsetDef; 1007 BaseOffset = OffsetDef->getOperand(1).getReg(); 1008 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1009 } 1010 1011 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { 1012 // If we have a constant offset, try to use the 0 in m0 as the base. 1013 // TODO: Look into changing the default m0 initialization value. If the 1014 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1015 // the immediate offset. 
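    // The entire offset is a known constant, so zero m0 and fold the constant
    // into the instruction's offset field below.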
1016 1017 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); 1018 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1019 .addImm(0); 1020 } else { 1021 std::tie(BaseOffset, ImmOffset, OffsetDef) 1022 = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset); 1023 1024 if (Readfirstlane) { 1025 // We have the constant offset now, so put the readfirstlane back on the 1026 // variable component. 1027 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) 1028 return false; 1029 1030 Readfirstlane->getOperand(1).setReg(BaseOffset); 1031 BaseOffset = Readfirstlane->getOperand(0).getReg(); 1032 } else { 1033 if (!RBI.constrainGenericRegister(BaseOffset, 1034 AMDGPU::SReg_32RegClass, *MRI)) 1035 return false; 1036 } 1037 1038 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1039 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) 1040 .addReg(BaseOffset) 1041 .addImm(16); 1042 1043 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1044 .addReg(M0Base); 1045 } 1046 1047 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1048 // offset field) % 64. Some versions of the programming guide omit the m0 1049 // part, or claim it's from offset 0. 1050 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); 1051 1052 if (HasVSrc) { 1053 Register VSrc = MI.getOperand(1).getReg(); 1054 MIB.addReg(VSrc); 1055 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) 1056 return false; 1057 } 1058 1059 MIB.addImm(ImmOffset) 1060 .addImm(-1) // $gds 1061 .cloneMemRefs(MI); 1062 1063 MI.eraseFromParent(); 1064 return true; 1065 } 1066 1067 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, 1068 bool IsAppend) const { 1069 Register PtrBase = MI.getOperand(2).getReg(); 1070 LLT PtrTy = MRI->getType(PtrBase); 1071 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 1072 1073 unsigned Offset; 1074 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); 1075 1076 // TODO: Should this try to look through readfirstlane like GWS? 1077 if (!isDSOffsetLegal(PtrBase, Offset, 16)) { 1078 PtrBase = MI.getOperand(2).getReg(); 1079 Offset = 0; 1080 } 1081 1082 MachineBasicBlock *MBB = MI.getParent(); 1083 const DebugLoc &DL = MI.getDebugLoc(); 1084 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1085 1086 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1087 .addReg(PtrBase); 1088 BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) 1089 .addImm(Offset) 1090 .addImm(IsGDS ? 
    -1 : 0)
    .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
  MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it. So we need to manually set the register class here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
               constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
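  // V_CNDMASK_B32 picks src1 for active condition bits, so the true value
  // (operand 2) goes in the src1 slot and the false value (operand 3) in src0;
  // the extra zero immediates are the source modifier operands.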
1156 if (Size > 32) 1157 return false; 1158 1159 MachineInstr *Select = 1160 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1161 .addImm(0) 1162 .add(I.getOperand(3)) 1163 .addImm(0) 1164 .add(I.getOperand(2)) 1165 .add(I.getOperand(1)); 1166 1167 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1168 I.eraseFromParent(); 1169 return Ret; 1170 } 1171 1172 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1173 initM0(I); 1174 return selectImpl(I, *CoverageInfo); 1175 } 1176 1177 static int sizeToSubRegIndex(unsigned Size) { 1178 switch (Size) { 1179 case 32: 1180 return AMDGPU::sub0; 1181 case 64: 1182 return AMDGPU::sub0_sub1; 1183 case 96: 1184 return AMDGPU::sub0_sub1_sub2; 1185 case 128: 1186 return AMDGPU::sub0_sub1_sub2_sub3; 1187 case 256: 1188 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1189 default: 1190 if (Size < 32) 1191 return AMDGPU::sub0; 1192 if (Size > 256) 1193 return -1; 1194 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1195 } 1196 } 1197 1198 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1199 Register DstReg = I.getOperand(0).getReg(); 1200 Register SrcReg = I.getOperand(1).getReg(); 1201 const LLT DstTy = MRI->getType(DstReg); 1202 const LLT SrcTy = MRI->getType(SrcReg); 1203 if (!DstTy.isScalar()) 1204 return false; 1205 1206 const LLT S1 = LLT::scalar(1); 1207 1208 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1209 const RegisterBank *DstRB; 1210 if (DstTy == S1) { 1211 // This is a special case. We don't treat s1 for legalization artifacts as 1212 // vcc booleans. 1213 DstRB = SrcRB; 1214 } else { 1215 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1216 if (SrcRB != DstRB) 1217 return false; 1218 } 1219 1220 unsigned DstSize = DstTy.getSizeInBits(); 1221 unsigned SrcSize = SrcTy.getSizeInBits(); 1222 1223 const TargetRegisterClass *SrcRC 1224 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1225 const TargetRegisterClass *DstRC 1226 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1227 1228 if (SrcSize > 32) { 1229 int SubRegIdx = sizeToSubRegIndex(DstSize); 1230 if (SubRegIdx == -1) 1231 return false; 1232 1233 // Deal with weird cases where the class only partially supports the subreg 1234 // index. 1235 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1236 if (!SrcRC) 1237 return false; 1238 1239 I.getOperand(1).setSubReg(SubRegIdx); 1240 } 1241 1242 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1243 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1244 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1245 return false; 1246 } 1247 1248 I.setDesc(TII.get(TargetOpcode::COPY)); 1249 return true; 1250 } 1251 1252 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1253 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1254 Mask = maskTrailingOnes<unsigned>(Size); 1255 int SignedMask = static_cast<int>(Mask); 1256 return SignedMask >= -16 && SignedMask <= 64; 1257 } 1258 1259 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1260 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1261 Register Reg, const MachineRegisterInfo &MRI, 1262 const TargetRegisterInfo &TRI) const { 1263 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1264 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1265 return RB; 1266 1267 // Ignore the type, since we don't use vcc in artifacts. 
1268 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1269 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1270 return nullptr; 1271 } 1272 1273 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1274 bool Signed = I.getOpcode() == AMDGPU::G_SEXT; 1275 const DebugLoc &DL = I.getDebugLoc(); 1276 MachineBasicBlock &MBB = *I.getParent(); 1277 const Register DstReg = I.getOperand(0).getReg(); 1278 const Register SrcReg = I.getOperand(1).getReg(); 1279 1280 const LLT DstTy = MRI->getType(DstReg); 1281 const LLT SrcTy = MRI->getType(SrcReg); 1282 const unsigned SrcSize = SrcTy.getSizeInBits(); 1283 const unsigned DstSize = DstTy.getSizeInBits(); 1284 if (!DstTy.isScalar()) 1285 return false; 1286 1287 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1288 return selectCOPY(I); 1289 1290 // Artifact casts should never use vcc. 1291 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1292 1293 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1294 // 64-bit should have been split up in RegBankSelect 1295 1296 // Try to use an and with a mask if it will save code size. 1297 unsigned Mask; 1298 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1299 MachineInstr *ExtI = 1300 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1301 .addImm(Mask) 1302 .addReg(SrcReg); 1303 I.eraseFromParent(); 1304 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1305 } 1306 1307 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1308 MachineInstr *ExtI = 1309 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1310 .addReg(SrcReg) 1311 .addImm(0) // Offset 1312 .addImm(SrcSize); // Width 1313 I.eraseFromParent(); 1314 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1315 } 1316 1317 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1318 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) 1319 return false; 1320 1321 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1322 const unsigned SextOpc = SrcSize == 8 ? 1323 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1324 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1325 .addReg(SrcReg); 1326 I.eraseFromParent(); 1327 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1328 } 1329 1330 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1331 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1332 1333 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1334 if (DstSize > 32 && SrcSize <= 32) { 1335 // We need a 64-bit register source, but the high bits don't matter. 
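      // Widen the source to 64 bits with an undef high half via REG_SEQUENCE,
      // then let the 64-bit BFE sign/zero-extend from the low SrcSize bits.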
1336 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1337 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1338 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1339 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1340 .addReg(SrcReg) 1341 .addImm(AMDGPU::sub0) 1342 .addReg(UndefReg) 1343 .addImm(AMDGPU::sub1); 1344 1345 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1346 .addReg(ExtReg) 1347 .addImm(SrcSize << 16); 1348 1349 I.eraseFromParent(); 1350 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1351 } 1352 1353 unsigned Mask; 1354 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1355 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1356 .addReg(SrcReg) 1357 .addImm(Mask); 1358 } else { 1359 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1360 .addReg(SrcReg) 1361 .addImm(SrcSize << 16); 1362 } 1363 1364 I.eraseFromParent(); 1365 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1366 } 1367 1368 return false; 1369 } 1370 1371 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1372 MachineBasicBlock *BB = I.getParent(); 1373 MachineOperand &ImmOp = I.getOperand(1); 1374 1375 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1376 if (ImmOp.isFPImm()) { 1377 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1378 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1379 } else if (ImmOp.isCImm()) { 1380 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1381 } 1382 1383 Register DstReg = I.getOperand(0).getReg(); 1384 unsigned Size; 1385 bool IsSgpr; 1386 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1387 if (RB) { 1388 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1389 Size = MRI->getType(DstReg).getSizeInBits(); 1390 } else { 1391 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1392 IsSgpr = TRI.isSGPRClass(RC); 1393 Size = TRI.getRegSizeInBits(*RC); 1394 } 1395 1396 if (Size != 32 && Size != 64) 1397 return false; 1398 1399 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1400 if (Size == 32) { 1401 I.setDesc(TII.get(Opcode)); 1402 I.addImplicitDefUseOperands(*MF); 1403 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1404 } 1405 1406 const DebugLoc &DL = I.getDebugLoc(); 1407 1408 APInt Imm(Size, I.getOperand(1).getImm()); 1409 1410 MachineInstr *ResInst; 1411 if (IsSgpr && TII.isInlineConstant(Imm)) { 1412 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1413 .addImm(I.getOperand(1).getImm()); 1414 } else { 1415 const TargetRegisterClass *RC = IsSgpr ? 
1416 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 1417 Register LoReg = MRI->createVirtualRegister(RC); 1418 Register HiReg = MRI->createVirtualRegister(RC); 1419 1420 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 1421 .addImm(Imm.trunc(32).getZExtValue()); 1422 1423 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 1424 .addImm(Imm.ashr(32).getZExtValue()); 1425 1426 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1427 .addReg(LoReg) 1428 .addImm(AMDGPU::sub0) 1429 .addReg(HiReg) 1430 .addImm(AMDGPU::sub1); 1431 } 1432 1433 // We can't call constrainSelectedInstRegOperands here, because it doesn't 1434 // work for target independent opcodes 1435 I.eraseFromParent(); 1436 const TargetRegisterClass *DstRC = 1437 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 1438 if (!DstRC) 1439 return true; 1440 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1441 } 1442 1443 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 1444 // Only manually handle the f64 SGPR case. 1445 // 1446 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 1447 // the bit ops theoretically have a second result due to the implicit def of 1448 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 1449 // that is easy by disabling the check. The result works, but uses a 1450 // nonsensical sreg32orlds_and_sreg_1 regclass. 1451 // 1452 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 1453 // the variadic REG_SEQUENCE operands. 1454 1455 Register Dst = MI.getOperand(0).getReg(); 1456 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 1457 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 1458 MRI->getType(Dst) != LLT::scalar(64)) 1459 return false; 1460 1461 Register Src = MI.getOperand(1).getReg(); 1462 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 1463 if (Fabs) 1464 Src = Fabs->getOperand(1).getReg(); 1465 1466 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 1467 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 1468 return false; 1469 1470 MachineBasicBlock *BB = MI.getParent(); 1471 const DebugLoc &DL = MI.getDebugLoc(); 1472 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1473 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1474 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1475 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1476 1477 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 1478 .addReg(Src, 0, AMDGPU::sub0); 1479 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 1480 .addReg(Src, 0, AMDGPU::sub1); 1481 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 1482 .addImm(0x80000000); 1483 1484 // Set or toggle sign bit. 1485 unsigned Opc = Fabs ? 
    AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
  const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}

bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
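    // Writing all ones should give the largest possible LDS size limit in m0,
    // so DS accesses are never treated as out of bounds.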
1574 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1575 .addImm(-1); 1576 } 1577 } 1578 1579 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { 1580 initM0(I); 1581 return selectImpl(I, *CoverageInfo); 1582 } 1583 1584 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 1585 MachineBasicBlock *BB = I.getParent(); 1586 MachineOperand &CondOp = I.getOperand(0); 1587 Register CondReg = CondOp.getReg(); 1588 const DebugLoc &DL = I.getDebugLoc(); 1589 1590 unsigned BrOpcode; 1591 Register CondPhysReg; 1592 const TargetRegisterClass *ConstrainRC; 1593 1594 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 1595 // whether the branch is uniform when selecting the instruction. In 1596 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 1597 // RegBankSelect knows what it's doing if the branch condition is scc, even 1598 // though it currently does not. 1599 if (!isVCC(CondReg, *MRI)) { 1600 if (MRI->getType(CondReg) != LLT::scalar(32)) 1601 return false; 1602 1603 CondPhysReg = AMDGPU::SCC; 1604 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 1605 // FIXME: Hack for isSCC tests 1606 ConstrainRC = &AMDGPU::SGPR_32RegClass; 1607 } else { 1608 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 1609 // We sort of know that a VCC producer based on the register bank, that ands 1610 // inactive lanes with 0. What if there was a logical operation with vcc 1611 // producers in different blocks/with different exec masks? 1612 // FIXME: Should scc->vcc copies and with exec? 1613 CondPhysReg = TRI.getVCC(); 1614 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 1615 ConstrainRC = TRI.getBoolRC(); 1616 } 1617 1618 if (!MRI->getRegClassOrNull(CondReg)) 1619 MRI->setRegClass(CondReg, ConstrainRC); 1620 1621 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 1622 .addReg(CondReg); 1623 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 1624 .addMBB(I.getOperand(1).getMBB()); 1625 1626 I.eraseFromParent(); 1627 return true; 1628 } 1629 1630 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE( 1631 MachineInstr &I) const { 1632 Register DstReg = I.getOperand(0).getReg(); 1633 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1634 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1635 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 1636 if (IsVGPR) 1637 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 1638 1639 return RBI.constrainGenericRegister( 1640 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 1641 } 1642 1643 bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { 1644 uint64_t Align = I.getOperand(2).getImm(); 1645 const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); 1646 1647 MachineBasicBlock *BB = I.getParent(); 1648 1649 Register DstReg = I.getOperand(0).getReg(); 1650 Register SrcReg = I.getOperand(1).getReg(); 1651 1652 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1653 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1654 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1655 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1656 unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1657 const TargetRegisterClass &RegRC 1658 = IsVGPR ? 
    AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  LLT Ty = MRI->getType(DstReg);

  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
                                                                  *MRI);
  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
                                                                  *MRI);
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  Register ImmReg = MRI->createVirtualRegister(&RegRC);
  BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg)
    .addImm(Mask);

  if (Ty.getSizeInBits() == 32) {
    BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
      .addReg(SrcReg)
      .addReg(ImmReg);
    I.eraseFromParent();
    return true;
  }

  Register HiReg = MRI->createVirtualRegister(&RegRC);
  Register LoReg = MRI->createVirtualRegister(&RegRC);
  Register MaskLo = MRI->createVirtualRegister(&RegRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(SrcReg, 0, AMDGPU::sub0);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(SrcReg, 0, AMDGPU::sub1);

  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo)
    .addReg(LoReg)
    .addReg(ImmReg);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(MaskLo)
    .addImm(AMDGPU::sub0)
    .addReg(HiReg)
    .addImm(AMDGPU::sub1);
  I.eraseFromParent();
  return true;
}

/// Return the register to use for the index value, and the subregister to use
/// for the indirectly accessed register.
static std::pair<Register, unsigned>
computeIndirectRegIndex(MachineRegisterInfo &MRI,
                        const SIRegisterInfo &TRI,
                        const TargetRegisterClass *SuperRC,
                        Register IdxReg,
                        unsigned EltSize) {
  Register IdxBaseReg;
  int Offset;
  MachineInstr *Unused;

  std::tie(IdxBaseReg, Offset, Unused)
    = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (static_cast<unsigned>(Offset) >= SubRegs.size())
    return std::make_pair(IdxReg, SubRegs[0]);
  return std::make_pair(IdxBaseReg, SubRegs[Offset]);
}

bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
  MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register IdxReg = MI.getOperand(2).getReg();

  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
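  //
  // Illustrative post-RegBankSelect input (register names made up):
  //   %elt:sgpr(s32) = G_EXTRACT_VECTOR_ELT %vec:sgpr(<8 x s32>), %idx:sgpr(s32)
  // The SGPR path below copies the index into M0 and uses S_MOVRELS_B32/_B64;
  // the 32-bit VGPR path uses V_MOVRELS_B32 or VGPR index mode.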
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
                                                                  *MRI);
  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
                                                                  *MRI);
  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const bool Is64 = DstTy.getSizeInBits() == 64;

  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
                                                     DstTy.getSizeInBits() / 8);

  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
    if (DstTy.getSizeInBits() != 32 && !Is64)
      return false;

    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
    BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
      .addReg(SrcReg, 0, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
    return false;

  if (!STI.useVGPRIndexMode()) {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
    .addReg(IdxReg)
    .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
  BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
    .addReg(SrcReg, RegState::Undef, SubReg)
    .addReg(SrcReg, RegState::Implicit)
    .addReg(AMDGPU::M0, RegState::Implicit);
  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));

  MI.eraseFromParent();
  return true;
}

// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
  MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register VecReg = MI.getOperand(1).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  LLT VecTy = MRI->getType(DstReg);
  LLT ValTy = MRI->getType(ValReg);
  unsigned VecSize = VecTy.getSizeInBits();
  unsigned ValSize = ValTy.getSizeInBits();

  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  assert(VecTy.getElementType() == ValTy);

  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
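  //
  // The write itself goes through an indirect register write pseudo chosen by
  // TII.getIndirectRegWritePseudo below, addressed either through M0 or
  // through VGPR index mode when the subtarget supports it.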
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
                                                                  *MRI);
  const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
                                                                  *MRI);

  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
    return false;

  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
                                                     ValSize / 8);

  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
                         STI.useVGPRIndexMode();

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (IndexMode) {
    BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
      .addReg(IdxReg)
      .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
  } else {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
  }

  const MCInstrDesc &RegWriteOp
    = TII.getIndirectRegWritePseudo(VecSize, ValSize,
                                    VecRB->getID() == AMDGPU::SGPRRegBankID);
  BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
    .addReg(VecReg)
    .addReg(ValReg)
    .addImm(SubReg);

  if (IndexMode)
    BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::select(MachineInstr &I) {
  if (I.isPHI())
    return selectPHI(I);

  if (!I.isPreISelOpcode()) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    if (selectG_AND_OR_XOR(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_ADD_SUB(I);
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return selectG_UADDO_USUBO_UADDE_USUBE(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_PTRTOINT:
    return selectCOPY(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
    return selectG_CONSTANT(I);
  case TargetOpcode::G_FNEG:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_FNEG(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_PTR_ADD:
    return selectG_PTR_ADD(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
    if (selectG_ICMP(I))
      return true;
    return selectImpl(I,
                      *CoverageInfo);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_FADD:
    return selectG_LOAD_ATOMICRMW(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_STORE:
    return selectG_STORE(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_SZA_EXT(I);
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE:
    return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
  case TargetOpcode::G_PTR_MASK:
    return selectG_PTR_MASK(I);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return selectG_EXTRACT_VECTOR_ELT(I);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return selectG_INSERT_VECTOR_ELT(I);
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
    initM0(I);
    return selectImpl(I, *CoverageInfo);
  default:
    return selectImpl(I, *CoverageInfo);
  }
  return false;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(
  Register Src) const {
  unsigned Mods = 0;
  MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);

  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::NEG;
    MI = getDefIgnoringCopies(Src, *MRI);
  }

  if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::ABS;
  }

  return std::make_pair(Src, Mods);
}

///
/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
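/// The renderer simply re-emits the root operand unchanged, so it accepts
/// whichever register bank the operand already has.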
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
  Register Reg = Root.getReg();
  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
  if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
              Def->getOpcode() == AMDGPU::G_FABS))
    return {};
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
  if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
    return None;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
  // FIXME: Handle clamp and op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // clamp
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
  // FIXME: Handle op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];

  if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
    return None;

  unsigned PtrReg =
    GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  if (!isUInt<32>(EncodedImm))
    return None;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits,
  // then we can select all ptr + 32-bit offsets not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an sgpr offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM patterns are considered before the _SGPR patterns.
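  //
  // Roughly, instead of the rejected immediate form
  //   s_load_dword dst, ptr, <imm that does not encode>
  // this emits
  //   s_mov_b32 soffset, imm
  //   s_load_dword dst, ptr, soffset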
  unsigned PtrReg = GEPInfo.SgprParts[0];
  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(GEPInfo.Imm);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}

template <bool Signed>
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();

  InstructionSelector::ComplexRendererFns Default = {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },  // offset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }   // slc
    }};

  if (!STI.hasFlatInstOffsets())
    return Default;

  const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
  if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
    return Default;

  Optional<int64_t> Offset =
    getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
  if (!Offset.hasValue())
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
    return Default;

  Register BasePtr = OpDef->getOperand(1).getReg();

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  return selectFlatOffsetImpl<false>(Root);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
  return selectFlatOffsetImpl<true>(Root);
}

static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
      .addImm(Offset & ~4095);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               const MachineMemOperand *MMO = *MI->memoperands_begin();
               const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

               Register SOffsetReg = isStackPtrRelative(PtrInfo) ?
                   Info->getStackPtrOffsetReg() :
                   Info->getScratchWaveOffsetReg();
               MIB.addReg(SOffsetReg);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    if (isBaseWithConstantOffset(Root, *MRI)) {
      const MachineOperand &LHS = RootDef->getOperand(1);
      const MachineOperand &RHS = RootDef->getOperand(2);
      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
      if (LHSDef && RHSDef) {
        int64_t PossibleOffset =
          RHSDef->getOperand(1).getCImm()->getSExtValue();
        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
            (!STI.privateMemoryResourceIsRangeChecked() ||
             KnownBits->signBitIsZero(LHS.getReg()))) {
          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
            FI = LHSDef->getOperand(1).getIndex();
          else
            VAddr = LHS.getReg();
          Offset = PossibleOffset;
        }
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  // TODO: Should split large offsets that don't fit like above.
  // TODO: Don't use scratch wave offset just because the offset didn't fit.
  Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
                                   : Info->getScratchWaveOffsetReg();

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             MIB.addReg(SOffset);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}

bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
                                                int64_t Offset,
                                                unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return KnownBits->signBitIsZero(Base);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  Register SOffsetReg = isStackPtrRelative(PtrInfo) ?
      Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(Info->getScratchRSrcReg());
      },                                                         // rsrc
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }      // offset
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;
  if (isBaseWithConstantOffset(Root, *MRI)) {
    const MachineOperand &LHS = RootDef->getOperand(1);
    const MachineOperand &RHS = RootDef->getOperand(2);
    const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
    const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
    if (LHSDef && RHSDef) {
      int64_t PossibleOffset =
        RHSDef->getOperand(1).getCImm()->getSExtValue();
      if (isDSOffsetLegal(LHS.getReg(), PossibleOffset, 16)) {
        // (add n0, c0)
        return std::make_pair(LHS.getReg(), PossibleOffset);
      }
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO
  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO
  }

  return std::make_pair(Root.getReg(), 0);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef) {
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(1); }
    }};
  }

  int64_t ConstAddr = 0;
  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    int64_t DWordOffset0 = Offset / 4;
    int64_t DWordOffset1 = DWordOffset0 + 1;
    if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
      // (add n0, c0)
      return {{
          [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); },
          [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset0); },
          [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset1); }
      }};
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO
  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(1); }
  }};
}

/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this does
/// not match the pattern.
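/// For example, (G_PTR_ADD %base, (G_CONSTANT i64 16)) yields {%base, 16}.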
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
  Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = MRI.getVRegDef(Root);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0};

  MachineOperand &RHS = RootI->getOperand(2);
  Optional<ValueAndVReg> MaybeOffset
    = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
  if (!MaybeOffset)
    return {Root, 0};
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
}

static void addZeroImm(MachineInstrBuilder &MIB) {
  MIB.addImm(0);
}

/// Return a resource descriptor for use with an arbitrary 64-bit pointer.
/// If \p BasePtr is not valid, a null base pointer will be used.
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the 64-bit half of the descriptor that holds the constants before
  // building the full 128-bit register. If we are building multiple resource
  // descriptors, this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}

static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}

static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
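  //
  // The all-ones low word here becomes NUM_RECORDS (dword 2 of the
  // descriptor), which effectively disables the buffer range check for
  // offset-mode accesses.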
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: Don't know that this was defined by operand 0
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return true if the addr64 mubuf mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(SOffset)
    .addImm(ImmOffset);
  ImmOffset = 0;
}

bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
  Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // The addr64 bit was removed for Volcanic Islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
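      // The uniform N2 can serve as the SRD base pointer, while the
      // (possibly divergent) N3 goes in the 64-bit vaddr for addr64 mode.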
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource.
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
  int64_t &Offset) const {
  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Register SRDPtr = AddrData.N0;
  Offset = AddrData.Offset;

  // TODO: Look through extensions for 32-bit soffset.
  MachineIRBuilder B(*Root.getParent());

  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm, // glc
      addZeroImm, // slc
      addZeroImm, // tfe
      addZeroImm, // dlc
      addZeroImm  // swz
  }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), *MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(CstVal.getValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void
AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
                                            const MachineInstr &MI,
                                            int OpIdx) const {
  assert(OpIdx == -1);

  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
    MIB.addImm(Op.getCImm()->getSExtValue());
  }
}

void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
}

/// This only really exists to satisfy DAG type checking machinery, so is a
/// no-op here.
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm());
}

void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
}

void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
}

void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
}

void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}

bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}