1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUInstructionSelector.h" 15 #include "AMDGPUInstrInfo.h" 16 #include "AMDGPUGlobalISelUtils.h" 17 #include "AMDGPURegisterBankInfo.h" 18 #include "AMDGPURegisterInfo.h" 19 #include "AMDGPUSubtarget.h" 20 #include "AMDGPUTargetMachine.h" 21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 22 #include "SIMachineFunctionInfo.h" 23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 24 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 25 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 28 #include "llvm/CodeGen/GlobalISel/Utils.h" 29 #include "llvm/CodeGen/MachineBasicBlock.h" 30 #include "llvm/CodeGen/MachineFunction.h" 31 #include "llvm/CodeGen/MachineInstr.h" 32 #include "llvm/CodeGen/MachineInstrBuilder.h" 33 #include "llvm/CodeGen/MachineRegisterInfo.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/Support/Debug.h" 36 #include "llvm/Support/raw_ostream.h" 37 38 #define DEBUG_TYPE "amdgpu-isel" 39 40 using namespace llvm; 41 using namespace MIPatternMatch; 42 43 #define GET_GLOBALISEL_IMPL 44 #define AMDGPUSubtarget GCNSubtarget 45 #include "AMDGPUGenGlobalISel.inc" 46 #undef GET_GLOBALISEL_IMPL 47 #undef AMDGPUSubtarget 48 49 AMDGPUInstructionSelector::AMDGPUInstructionSelector( 50 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, 51 const AMDGPUTargetMachine &TM) 52 : InstructionSelector(), TII(*STI.getInstrInfo()), 53 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), 54 STI(STI), 55 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), 56 #define GET_GLOBALISEL_PREDICATES_INIT 57 #include "AMDGPUGenGlobalISel.inc" 58 #undef GET_GLOBALISEL_PREDICATES_INIT 59 #define GET_GLOBALISEL_TEMPORARIES_INIT 60 #include "AMDGPUGenGlobalISel.inc" 61 #undef GET_GLOBALISEL_TEMPORARIES_INIT 62 { 63 } 64 65 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } 66 67 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, 68 CodeGenCoverage &CoverageInfo) { 69 MRI = &MF.getRegInfo(); 70 InstructionSelector::setupMF(MF, KB, CoverageInfo); 71 } 72 73 bool AMDGPUInstructionSelector::isVCC(Register Reg, 74 const MachineRegisterInfo &MRI) const { 75 if (Register::isPhysicalRegister(Reg)) 76 return Reg == TRI.getVCC(); 77 78 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 79 const TargetRegisterClass *RC = 80 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 81 if (RC) { 82 const LLT Ty = MRI.getType(Reg); 83 return RC->hasSuperClassEq(TRI.getBoolRC()) && 84 Ty.isValid() && Ty.getSizeInBits() == 1; 85 } 86 87 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 88 return RB->getID() == AMDGPU::VCCRegBankID; 89 } 90 91 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, 92 unsigned NewOpc) const { 93 MI.setDesc(TII.get(NewOpc)); 94 
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to know
    // specifically that the result is VCCRegBank, and not just an SGPR with
    // size 1. An SReg_32 with size 1 is ambiguous with wave32.
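    // An undef source has no real def that could later pick a class for it,
    // so this is the only chance to give it one.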
174 if (Src.isUndef()) { 175 const TargetRegisterClass *SrcRC = 176 TRI.getConstrainedRegClassForOperand(Src, *MRI); 177 if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 178 return false; 179 } 180 181 return true; 182 } 183 184 for (const MachineOperand &MO : I.operands()) { 185 if (Register::isPhysicalRegister(MO.getReg())) 186 continue; 187 188 const TargetRegisterClass *RC = 189 TRI.getConstrainedRegClassForOperand(MO, *MRI); 190 if (!RC) 191 continue; 192 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 193 } 194 return true; 195 } 196 197 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { 198 const Register DefReg = I.getOperand(0).getReg(); 199 const LLT DefTy = MRI->getType(DefReg); 200 201 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy) 202 203 const RegClassOrRegBank &RegClassOrBank = 204 MRI->getRegClassOrRegBank(DefReg); 205 206 const TargetRegisterClass *DefRC 207 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 208 if (!DefRC) { 209 if (!DefTy.isValid()) { 210 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 211 return false; 212 } 213 214 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 215 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); 216 if (!DefRC) { 217 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 218 return false; 219 } 220 } 221 222 // TODO: Verify that all registers have the same bank 223 I.setDesc(TII.get(TargetOpcode::PHI)); 224 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); 225 } 226 227 MachineOperand 228 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, 229 const TargetRegisterClass &SubRC, 230 unsigned SubIdx) const { 231 232 MachineInstr *MI = MO.getParent(); 233 MachineBasicBlock *BB = MO.getParent()->getParent(); 234 Register DstReg = MRI->createVirtualRegister(&SubRC); 235 236 if (MO.isReg()) { 237 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); 238 Register Reg = MO.getReg(); 239 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 240 .addReg(Reg, 0, ComposedSubIdx); 241 242 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), 243 MO.isKill(), MO.isDead(), MO.isUndef(), 244 MO.isEarlyClobber(), 0, MO.isDebug(), 245 MO.isInternalRead()); 246 } 247 248 assert(MO.isImm()); 249 250 APInt Imm(64, MO.getImm()); 251 252 switch (SubIdx) { 253 default: 254 llvm_unreachable("do not know to split immediate with this sub index."); 255 case AMDGPU::sub0: 256 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); 257 case AMDGPU::sub1: 258 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); 259 } 260 } 261 262 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { 263 switch (Opc) { 264 case AMDGPU::G_AND: 265 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 266 case AMDGPU::G_OR: 267 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; 268 case AMDGPU::G_XOR: 269 return Is64 ? 
AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 270 default: 271 llvm_unreachable("not a bit op"); 272 } 273 } 274 275 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 276 MachineOperand &Dst = I.getOperand(0); 277 MachineOperand &Src0 = I.getOperand(1); 278 MachineOperand &Src1 = I.getOperand(2); 279 Register DstReg = Dst.getReg(); 280 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 281 282 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 283 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 284 const TargetRegisterClass *RC = TRI.getBoolRC(); 285 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), 286 RC == &AMDGPU::SReg_64RegClass); 287 I.setDesc(TII.get(InstOpc)); 288 289 // FIXME: Hack to avoid turning the register bank into a register class. 290 // The selector for G_ICMP relies on seeing the register bank for the result 291 // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will 292 // be ambiguous whether it's a scalar or vector bool. 293 if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) 294 MRI->setRegClass(Src0.getReg(), RC); 295 if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) 296 MRI->setRegClass(Src1.getReg(), RC); 297 298 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 299 } 300 301 // TODO: Should this allow an SCC bank result, and produce a copy from SCC for 302 // the result? 303 if (DstRB->getID() == AMDGPU::SGPRRegBankID) { 304 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); 305 I.setDesc(TII.get(InstOpc)); 306 // Dead implicit-def of scc 307 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 308 true, // isImp 309 false, // isKill 310 true)); // isDead 311 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 312 } 313 314 return false; 315 } 316 317 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 318 MachineBasicBlock *BB = I.getParent(); 319 MachineFunction *MF = BB->getParent(); 320 Register DstReg = I.getOperand(0).getReg(); 321 const DebugLoc &DL = I.getDebugLoc(); 322 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 323 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 324 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 325 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 326 327 if (Size == 32) { 328 if (IsSALU) { 329 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 330 MachineInstr *Add = 331 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 332 .add(I.getOperand(1)) 333 .add(I.getOperand(2)); 334 I.eraseFromParent(); 335 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 336 } 337 338 if (STI.hasAddNoCarry()) { 339 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 340 I.setDesc(TII.get(Opc)); 341 I.addOperand(*MF, MachineOperand::CreateImm(0)); 342 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 343 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 344 } 345 346 const unsigned Opc = Sub ? 
AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    // The names of these opcodes are misleading: v_add_i32/v_sub_i32 have an
    // unsigned carry out despite the _i32 name. These were renamed in VI to
    // _U32.
    // FIXME: We should probably rename the opcodes here.
    unsigned NoCarryOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ?
CarryOpc : NoCarryOpc)); 433 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 434 I.addOperand(*MF, MachineOperand::CreateImm(0)); 435 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 436 } 437 438 Register Src0Reg = I.getOperand(2).getReg(); 439 Register Src1Reg = I.getOperand(3).getReg(); 440 441 if (HasCarryIn) { 442 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 443 .addReg(I.getOperand(4).getReg()); 444 } 445 446 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 447 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 448 449 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg) 450 .add(I.getOperand(2)) 451 .add(I.getOperand(3)); 452 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 453 .addReg(AMDGPU::SCC); 454 455 if (!MRI->getRegClassOrNull(Dst1Reg)) 456 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 457 458 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 459 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 460 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 461 return false; 462 463 if (HasCarryIn && 464 !RBI.constrainGenericRegister(I.getOperand(4).getReg(), 465 AMDGPU::SReg_32RegClass, *MRI)) 466 return false; 467 468 I.eraseFromParent(); 469 return true; 470 } 471 472 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 473 MachineBasicBlock *BB = I.getParent(); 474 Register DstReg = I.getOperand(0).getReg(); 475 Register SrcReg = I.getOperand(1).getReg(); 476 LLT DstTy = MRI->getType(DstReg); 477 LLT SrcTy = MRI->getType(SrcReg); 478 const unsigned SrcSize = SrcTy.getSizeInBits(); 479 const unsigned DstSize = DstTy.getSizeInBits(); 480 481 // TODO: Should handle any multiple of 32 offset. 
482 unsigned Offset = I.getOperand(2).getImm(); 483 if (Offset % DstSize != 0) 484 return false; 485 486 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 487 const TargetRegisterClass *SrcRC = 488 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 489 if (!SrcRC) 490 return false; 491 492 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 493 494 const DebugLoc &DL = I.getDebugLoc(); 495 MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) 496 .addReg(SrcReg, 0, SubRegs[Offset / DstSize]); 497 498 for (const MachineOperand &MO : Copy->operands()) { 499 const TargetRegisterClass *RC = 500 TRI.getConstrainedRegClassForOperand(MO, *MRI); 501 if (!RC) 502 continue; 503 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 504 } 505 I.eraseFromParent(); 506 return true; 507 } 508 509 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 510 MachineBasicBlock *BB = MI.getParent(); 511 Register DstReg = MI.getOperand(0).getReg(); 512 LLT DstTy = MRI->getType(DstReg); 513 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 514 515 const unsigned SrcSize = SrcTy.getSizeInBits(); 516 if (SrcSize < 32) 517 return selectImpl(MI, *CoverageInfo); 518 519 const DebugLoc &DL = MI.getDebugLoc(); 520 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 521 const unsigned DstSize = DstTy.getSizeInBits(); 522 const TargetRegisterClass *DstRC = 523 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 524 if (!DstRC) 525 return false; 526 527 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 528 MachineInstrBuilder MIB = 529 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 530 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 531 MachineOperand &Src = MI.getOperand(I + 1); 532 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 533 MIB.addImm(SubRegs[I]); 534 535 const TargetRegisterClass *SrcRC 536 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 537 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 538 return false; 539 } 540 541 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 542 return false; 543 544 MI.eraseFromParent(); 545 return true; 546 } 547 548 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 549 MachineBasicBlock *BB = MI.getParent(); 550 const int NumDst = MI.getNumOperands() - 1; 551 552 MachineOperand &Src = MI.getOperand(NumDst); 553 554 Register SrcReg = Src.getReg(); 555 Register DstReg0 = MI.getOperand(0).getReg(); 556 LLT DstTy = MRI->getType(DstReg0); 557 LLT SrcTy = MRI->getType(SrcReg); 558 559 const unsigned DstSize = DstTy.getSizeInBits(); 560 const unsigned SrcSize = SrcTy.getSizeInBits(); 561 const DebugLoc &DL = MI.getDebugLoc(); 562 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 563 564 const TargetRegisterClass *SrcRC = 565 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 566 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 567 return false; 568 569 const unsigned SrcFlags = getUndefRegState(Src.isUndef()); 570 571 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 572 // source, and this relies on the fact that the same subregister indices are 573 // used for both. 
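  // For example, unmerging an s64 source into two s32 pieces becomes one COPY
  // from sub0 and one COPY from sub1 of the source register.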
574 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 575 for (int I = 0, E = NumDst; I != E; ++I) { 576 MachineOperand &Dst = MI.getOperand(I); 577 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 578 .addReg(SrcReg, SrcFlags, SubRegs[I]); 579 580 const TargetRegisterClass *DstRC = 581 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 582 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 583 return false; 584 } 585 586 MI.eraseFromParent(); 587 return true; 588 } 589 590 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 591 return selectG_ADD_SUB(I); 592 } 593 594 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 595 const MachineOperand &MO = I.getOperand(0); 596 597 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 598 // regbank check here is to know why getConstrainedRegClassForOperand failed. 599 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 600 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 601 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 602 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 603 return true; 604 } 605 606 return false; 607 } 608 609 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 610 MachineBasicBlock *BB = I.getParent(); 611 612 Register DstReg = I.getOperand(0).getReg(); 613 Register Src0Reg = I.getOperand(1).getReg(); 614 Register Src1Reg = I.getOperand(2).getReg(); 615 LLT Src1Ty = MRI->getType(Src1Reg); 616 617 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 618 unsigned InsSize = Src1Ty.getSizeInBits(); 619 620 int64_t Offset = I.getOperand(3).getImm(); 621 if (Offset % 32 != 0) 622 return false; 623 624 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 625 if (SubReg == AMDGPU::NoSubRegister) 626 return false; 627 628 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 629 const TargetRegisterClass *DstRC = 630 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 631 if (!DstRC) 632 return false; 633 634 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 635 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 636 const TargetRegisterClass *Src0RC = 637 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 638 const TargetRegisterClass *Src1RC = 639 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 640 641 // Deal with weird cases where the class only partially supports the subreg 642 // index. 
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
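    // SI_IF_BREAK is a wave-mask pseudo expanded later by control flow
    // lowering, so it is enough to build it directly here and pin all of its
    // operands to the wave mask register class.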
715 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK)) 716 .add(I.getOperand(0)) 717 .add(I.getOperand(2)) 718 .add(I.getOperand(3)); 719 720 Register DstReg = I.getOperand(0).getReg(); 721 Register Src0Reg = I.getOperand(2).getReg(); 722 Register Src1Reg = I.getOperand(3).getReg(); 723 724 I.eraseFromParent(); 725 726 for (Register Reg : { DstReg, Src0Reg, Src1Reg }) 727 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 728 729 return true; 730 } 731 case Intrinsic::amdgcn_interp_p1_f16: 732 return selectInterpP1F16(I); 733 case Intrinsic::amdgcn_wqm: 734 return constrainCopyLikeIntrin(I, AMDGPU::WQM); 735 case Intrinsic::amdgcn_softwqm: 736 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); 737 case Intrinsic::amdgcn_wwm: 738 return constrainCopyLikeIntrin(I, AMDGPU::WWM); 739 default: 740 return selectImpl(I, *CoverageInfo); 741 } 742 } 743 744 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) { 745 if (Size != 32 && Size != 64) 746 return -1; 747 switch (P) { 748 default: 749 llvm_unreachable("Unknown condition code!"); 750 case CmpInst::ICMP_NE: 751 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64; 752 case CmpInst::ICMP_EQ: 753 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64; 754 case CmpInst::ICMP_SGT: 755 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64; 756 case CmpInst::ICMP_SGE: 757 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64; 758 case CmpInst::ICMP_SLT: 759 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64; 760 case CmpInst::ICMP_SLE: 761 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64; 762 case CmpInst::ICMP_UGT: 763 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64; 764 case CmpInst::ICMP_UGE: 765 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64; 766 case CmpInst::ICMP_ULT: 767 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64; 768 case CmpInst::ICMP_ULE: 769 return Size == 32 ? 
AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
            I.getOperand(0).getReg())
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
862 MachineBasicBlock *BB = MI.getParent(); 863 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) 864 .add(MI.getOperand(1)); 865 866 Register Reg = MI.getOperand(1).getReg(); 867 MI.eraseFromParent(); 868 869 if (!MRI->getRegClassOrNull(Reg)) 870 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 871 return true; 872 } 873 874 static unsigned getDSShaderTypeValue(const MachineFunction &MF) { 875 switch (MF.getFunction().getCallingConv()) { 876 case CallingConv::AMDGPU_PS: 877 return 1; 878 case CallingConv::AMDGPU_VS: 879 return 2; 880 case CallingConv::AMDGPU_GS: 881 return 3; 882 case CallingConv::AMDGPU_HS: 883 case CallingConv::AMDGPU_LS: 884 case CallingConv::AMDGPU_ES: 885 report_fatal_error("ds_ordered_count unsupported for this calling conv"); 886 case CallingConv::AMDGPU_CS: 887 case CallingConv::AMDGPU_KERNEL: 888 case CallingConv::C: 889 case CallingConv::Fast: 890 default: 891 // Assume other calling conventions are various compute callable functions 892 return 0; 893 } 894 } 895 896 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 897 MachineInstr &MI, Intrinsic::ID IntrID) const { 898 MachineBasicBlock *MBB = MI.getParent(); 899 MachineFunction *MF = MBB->getParent(); 900 const DebugLoc &DL = MI.getDebugLoc(); 901 902 unsigned IndexOperand = MI.getOperand(7).getImm(); 903 bool WaveRelease = MI.getOperand(8).getImm() != 0; 904 bool WaveDone = MI.getOperand(9).getImm() != 0; 905 906 if (WaveDone && !WaveRelease) 907 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 908 909 unsigned OrderedCountIndex = IndexOperand & 0x3f; 910 IndexOperand &= ~0x3f; 911 unsigned CountDw = 0; 912 913 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 914 CountDw = (IndexOperand >> 24) & 0xf; 915 IndexOperand &= ~(0xf << 24); 916 917 if (CountDw < 1 || CountDw > 4) { 918 report_fatal_error( 919 "ds_ordered_count: dword count must be between 1 and 4"); 920 } 921 } 922 923 if (IndexOperand) 924 report_fatal_error("ds_ordered_count: bad index operand"); 925 926 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1;
  unsigned ShaderType = getDSShaderTypeValue(*MF);

  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
                     (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !STI.hasGWSSemaReleaseAll())
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  assert(OffsetDef);

  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only sets the low 16 bits, we could leave it as-is and add 1
    // to the immediate offset.
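    // With a compile-time-constant offset the whole value can be folded into
    // the instruction's offset field, so m0 only needs to be zeroed.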
1016 1017 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); 1018 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1019 .addImm(0); 1020 } else { 1021 std::tie(BaseOffset, ImmOffset, OffsetDef) 1022 = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset); 1023 1024 if (Readfirstlane) { 1025 // We have the constant offset now, so put the readfirstlane back on the 1026 // variable component. 1027 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) 1028 return false; 1029 1030 Readfirstlane->getOperand(1).setReg(BaseOffset); 1031 BaseOffset = Readfirstlane->getOperand(0).getReg(); 1032 } else { 1033 if (!RBI.constrainGenericRegister(BaseOffset, 1034 AMDGPU::SReg_32RegClass, *MRI)) 1035 return false; 1036 } 1037 1038 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1039 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) 1040 .addReg(BaseOffset) 1041 .addImm(16); 1042 1043 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1044 .addReg(M0Base); 1045 } 1046 1047 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1048 // offset field) % 64. Some versions of the programming guide omit the m0 1049 // part, or claim it's from offset 0. 1050 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); 1051 1052 if (HasVSrc) { 1053 Register VSrc = MI.getOperand(1).getReg(); 1054 MIB.addReg(VSrc); 1055 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) 1056 return false; 1057 } 1058 1059 MIB.addImm(ImmOffset) 1060 .addImm(-1) // $gds 1061 .cloneMemRefs(MI); 1062 1063 MI.eraseFromParent(); 1064 return true; 1065 } 1066 1067 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, 1068 bool IsAppend) const { 1069 Register PtrBase = MI.getOperand(2).getReg(); 1070 LLT PtrTy = MRI->getType(PtrBase); 1071 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 1072 1073 unsigned Offset; 1074 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); 1075 1076 // TODO: Should this try to look through readfirstlane like GWS? 1077 if (!isDSOffsetLegal(PtrBase, Offset, 16)) { 1078 PtrBase = MI.getOperand(2).getReg(); 1079 Offset = 0; 1080 } 1081 1082 MachineBasicBlock *MBB = MI.getParent(); 1083 const DebugLoc &DL = MI.getDebugLoc(); 1084 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1085 1086 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1087 .addReg(PtrBase); 1088 BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) 1089 .addImm(Offset) 1090 .addImm(IsGDS ? 
-1 : 0)
    .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it, so set the register class manually here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
            .add(I.getOperand(2))
            .add(I.getOperand(3));

    bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
               constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
1156 if (Size > 32) 1157 return false; 1158 1159 MachineInstr *Select = 1160 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1161 .addImm(0) 1162 .add(I.getOperand(3)) 1163 .addImm(0) 1164 .add(I.getOperand(2)) 1165 .add(I.getOperand(1)); 1166 1167 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1168 I.eraseFromParent(); 1169 return Ret; 1170 } 1171 1172 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1173 initM0(I); 1174 return selectImpl(I, *CoverageInfo); 1175 } 1176 1177 static int sizeToSubRegIndex(unsigned Size) { 1178 switch (Size) { 1179 case 32: 1180 return AMDGPU::sub0; 1181 case 64: 1182 return AMDGPU::sub0_sub1; 1183 case 96: 1184 return AMDGPU::sub0_sub1_sub2; 1185 case 128: 1186 return AMDGPU::sub0_sub1_sub2_sub3; 1187 case 256: 1188 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1189 default: 1190 if (Size < 32) 1191 return AMDGPU::sub0; 1192 if (Size > 256) 1193 return -1; 1194 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1195 } 1196 } 1197 1198 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1199 Register DstReg = I.getOperand(0).getReg(); 1200 Register SrcReg = I.getOperand(1).getReg(); 1201 const LLT DstTy = MRI->getType(DstReg); 1202 const LLT SrcTy = MRI->getType(SrcReg); 1203 if (!DstTy.isScalar()) 1204 return false; 1205 1206 const LLT S1 = LLT::scalar(1); 1207 1208 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1209 const RegisterBank *DstRB; 1210 if (DstTy == S1) { 1211 // This is a special case. We don't treat s1 for legalization artifacts as 1212 // vcc booleans. 1213 DstRB = SrcRB; 1214 } else { 1215 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1216 if (SrcRB != DstRB) 1217 return false; 1218 } 1219 1220 unsigned DstSize = DstTy.getSizeInBits(); 1221 unsigned SrcSize = SrcTy.getSizeInBits(); 1222 1223 const TargetRegisterClass *SrcRC 1224 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1225 const TargetRegisterClass *DstRC 1226 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1227 1228 if (SrcSize > 32) { 1229 int SubRegIdx = sizeToSubRegIndex(DstSize); 1230 if (SubRegIdx == -1) 1231 return false; 1232 1233 // Deal with weird cases where the class only partially supports the subreg 1234 // index. 1235 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1236 if (!SrcRC) 1237 return false; 1238 1239 I.getOperand(1).setSubReg(SubRegIdx); 1240 } 1241 1242 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1243 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1244 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1245 return false; 1246 } 1247 1248 I.setDesc(TII.get(TargetOpcode::COPY)); 1249 return true; 1250 } 1251 1252 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1253 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1254 Mask = maskTrailingOnes<unsigned>(Size); 1255 int SignedMask = static_cast<int>(Mask); 1256 return SignedMask >= -16 && SignedMask <= 64; 1257 } 1258 1259 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1260 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1261 Register Reg, const MachineRegisterInfo &MRI, 1262 const TargetRegisterInfo &TRI) const { 1263 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1264 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1265 return RB; 1266 1267 // Ignore the type, since we don't use vcc in artifacts. 
1268 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1269 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1270 return nullptr; 1271 } 1272 1273 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1274 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1275 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1276 const DebugLoc &DL = I.getDebugLoc(); 1277 MachineBasicBlock &MBB = *I.getParent(); 1278 const Register DstReg = I.getOperand(0).getReg(); 1279 const Register SrcReg = I.getOperand(1).getReg(); 1280 1281 const LLT DstTy = MRI->getType(DstReg); 1282 const LLT SrcTy = MRI->getType(SrcReg); 1283 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1284 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1285 const unsigned DstSize = DstTy.getSizeInBits(); 1286 if (!DstTy.isScalar()) 1287 return false; 1288 1289 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1290 return selectCOPY(I); 1291 1292 // Artifact casts should never use vcc. 1293 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1294 1295 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1296 // 64-bit should have been split up in RegBankSelect 1297 1298 // Try to use an and with a mask if it will save code size. 1299 unsigned Mask; 1300 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1301 MachineInstr *ExtI = 1302 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1303 .addImm(Mask) 1304 .addReg(SrcReg); 1305 I.eraseFromParent(); 1306 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1307 } 1308 1309 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1310 MachineInstr *ExtI = 1311 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1312 .addReg(SrcReg) 1313 .addImm(0) // Offset 1314 .addImm(SrcSize); // Width 1315 I.eraseFromParent(); 1316 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1317 } 1318 1319 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1320 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 1321 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 1322 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 1323 return false; 1324 1325 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1326 const unsigned SextOpc = SrcSize == 8 ? 1327 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1328 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1329 .addReg(SrcReg); 1330 I.eraseFromParent(); 1331 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1332 } 1333 1334 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1335 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1336 1337 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1338 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 1339 // We need a 64-bit register source, but the high bits don't matter. 1340 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1341 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1342 unsigned SubReg = InReg ? 
AMDGPU::sub0 : 0; 1343 1344 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1345 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1346 .addReg(SrcReg, 0, SubReg) 1347 .addImm(AMDGPU::sub0) 1348 .addReg(UndefReg) 1349 .addImm(AMDGPU::sub1); 1350 1351 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1352 .addReg(ExtReg) 1353 .addImm(SrcSize << 16); 1354 1355 I.eraseFromParent(); 1356 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1357 } 1358 1359 unsigned Mask; 1360 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1361 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1362 .addReg(SrcReg) 1363 .addImm(Mask); 1364 } else { 1365 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1366 .addReg(SrcReg) 1367 .addImm(SrcSize << 16); 1368 } 1369 1370 I.eraseFromParent(); 1371 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1372 } 1373 1374 return false; 1375 } 1376 1377 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1378 MachineBasicBlock *BB = I.getParent(); 1379 MachineOperand &ImmOp = I.getOperand(1); 1380 1381 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1382 if (ImmOp.isFPImm()) { 1383 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1384 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1385 } else if (ImmOp.isCImm()) { 1386 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1387 } 1388 1389 Register DstReg = I.getOperand(0).getReg(); 1390 unsigned Size; 1391 bool IsSgpr; 1392 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1393 if (RB) { 1394 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1395 Size = MRI->getType(DstReg).getSizeInBits(); 1396 } else { 1397 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1398 IsSgpr = TRI.isSGPRClass(RC); 1399 Size = TRI.getRegSizeInBits(*RC); 1400 } 1401 1402 if (Size != 32 && Size != 64) 1403 return false; 1404 1405 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1406 if (Size == 32) { 1407 I.setDesc(TII.get(Opcode)); 1408 I.addImplicitDefUseOperands(*MF); 1409 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1410 } 1411 1412 const DebugLoc &DL = I.getDebugLoc(); 1413 1414 APInt Imm(Size, I.getOperand(1).getImm()); 1415 1416 MachineInstr *ResInst; 1417 if (IsSgpr && TII.isInlineConstant(Imm)) { 1418 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1419 .addImm(I.getOperand(1).getImm()); 1420 } else { 1421 const TargetRegisterClass *RC = IsSgpr ? 
1422 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 1423 Register LoReg = MRI->createVirtualRegister(RC); 1424 Register HiReg = MRI->createVirtualRegister(RC); 1425 1426 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 1427 .addImm(Imm.trunc(32).getZExtValue()); 1428 1429 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 1430 .addImm(Imm.ashr(32).getZExtValue()); 1431 1432 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1433 .addReg(LoReg) 1434 .addImm(AMDGPU::sub0) 1435 .addReg(HiReg) 1436 .addImm(AMDGPU::sub1); 1437 } 1438 1439 // We can't call constrainSelectedInstRegOperands here, because it doesn't 1440 // work for target independent opcodes 1441 I.eraseFromParent(); 1442 const TargetRegisterClass *DstRC = 1443 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 1444 if (!DstRC) 1445 return true; 1446 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1447 } 1448 1449 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 1450 // Only manually handle the f64 SGPR case. 1451 // 1452 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 1453 // the bit ops theoretically have a second result due to the implicit def of 1454 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 1455 // that is easy by disabling the check. The result works, but uses a 1456 // nonsensical sreg32orlds_and_sreg_1 regclass. 1457 // 1458 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 1459 // the variadic REG_SEQUENCE operands. 1460 1461 Register Dst = MI.getOperand(0).getReg(); 1462 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 1463 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 1464 MRI->getType(Dst) != LLT::scalar(64)) 1465 return false; 1466 1467 Register Src = MI.getOperand(1).getReg(); 1468 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 1469 if (Fabs) 1470 Src = Fabs->getOperand(1).getReg(); 1471 1472 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 1473 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 1474 return false; 1475 1476 MachineBasicBlock *BB = MI.getParent(); 1477 const DebugLoc &DL = MI.getDebugLoc(); 1478 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1479 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1480 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1481 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1482 1483 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 1484 .addReg(Src, 0, AMDGPU::sub0); 1485 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 1486 .addReg(Src, 0, AMDGPU::sub1); 1487 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 1488 .addImm(0x80000000); 1489 1490 // Set or toggle sign bit. 1491 unsigned Opc = Fabs ? 
AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}

bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
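    // -1 (all bits set) is used as a conservative initial value; presumably
    // this keeps any m0-based LDS limiting from getting in the way.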
1580 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1581 .addImm(-1); 1582 } 1583 } 1584 1585 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { 1586 initM0(I); 1587 return selectImpl(I, *CoverageInfo); 1588 } 1589 1590 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 1591 MachineBasicBlock *BB = I.getParent(); 1592 MachineOperand &CondOp = I.getOperand(0); 1593 Register CondReg = CondOp.getReg(); 1594 const DebugLoc &DL = I.getDebugLoc(); 1595 1596 unsigned BrOpcode; 1597 Register CondPhysReg; 1598 const TargetRegisterClass *ConstrainRC; 1599 1600 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 1601 // whether the branch is uniform when selecting the instruction. In 1602 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 1603 // RegBankSelect knows what it's doing if the branch condition is scc, even 1604 // though it currently does not. 1605 if (!isVCC(CondReg, *MRI)) { 1606 if (MRI->getType(CondReg) != LLT::scalar(32)) 1607 return false; 1608 1609 CondPhysReg = AMDGPU::SCC; 1610 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 1611 // FIXME: Hack for isSCC tests 1612 ConstrainRC = &AMDGPU::SGPR_32RegClass; 1613 } else { 1614 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 1615 // We sort of know that a VCC producer based on the register bank, that ands 1616 // inactive lanes with 0. What if there was a logical operation with vcc 1617 // producers in different blocks/with different exec masks? 1618 // FIXME: Should scc->vcc copies and with exec? 1619 CondPhysReg = TRI.getVCC(); 1620 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 1621 ConstrainRC = TRI.getBoolRC(); 1622 } 1623 1624 if (!MRI->getRegClassOrNull(CondReg)) 1625 MRI->setRegClass(CondReg, ConstrainRC); 1626 1627 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 1628 .addReg(CondReg); 1629 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 1630 .addMBB(I.getOperand(1).getMBB()); 1631 1632 I.eraseFromParent(); 1633 return true; 1634 } 1635 1636 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE( 1637 MachineInstr &I) const { 1638 Register DstReg = I.getOperand(0).getReg(); 1639 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1640 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1641 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 1642 if (IsVGPR) 1643 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 1644 1645 return RBI.constrainGenericRegister( 1646 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 1647 } 1648 1649 bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { 1650 uint64_t Align = I.getOperand(2).getImm(); 1651 const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); 1652 1653 MachineBasicBlock *BB = I.getParent(); 1654 1655 Register DstReg = I.getOperand(0).getReg(); 1656 Register SrcReg = I.getOperand(1).getReg(); 1657 1658 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1659 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1660 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1661 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1662 unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1663 const TargetRegisterClass &RegRC 1664 = IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 1665 1666 LLT Ty = MRI->getType(DstReg); 1667 1668 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 1669 *MRI); 1670 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 1671 *MRI); 1672 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 1673 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 1674 return false; 1675 1676 const DebugLoc &DL = I.getDebugLoc(); 1677 Register ImmReg = MRI->createVirtualRegister(&RegRC); 1678 BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg) 1679 .addImm(Mask); 1680 1681 if (Ty.getSizeInBits() == 32) { 1682 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 1683 .addReg(SrcReg) 1684 .addReg(ImmReg); 1685 I.eraseFromParent(); 1686 return true; 1687 } 1688 1689 Register HiReg = MRI->createVirtualRegister(&RegRC); 1690 Register LoReg = MRI->createVirtualRegister(&RegRC); 1691 Register MaskLo = MRI->createVirtualRegister(&RegRC); 1692 1693 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 1694 .addReg(SrcReg, 0, AMDGPU::sub0); 1695 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 1696 .addReg(SrcReg, 0, AMDGPU::sub1); 1697 1698 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo) 1699 .addReg(LoReg) 1700 .addReg(ImmReg); 1701 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1702 .addReg(MaskLo) 1703 .addImm(AMDGPU::sub0) 1704 .addReg(HiReg) 1705 .addImm(AMDGPU::sub1); 1706 I.eraseFromParent(); 1707 return true; 1708 } 1709 1710 /// Return the register to use for the index value, and the subregister to use 1711 /// for the indirectly accessed register. 1712 static std::pair<Register, unsigned> 1713 computeIndirectRegIndex(MachineRegisterInfo &MRI, 1714 const SIRegisterInfo &TRI, 1715 const TargetRegisterClass *SuperRC, 1716 Register IdxReg, 1717 unsigned EltSize) { 1718 Register IdxBaseReg; 1719 int Offset; 1720 MachineInstr *Unused; 1721 1722 std::tie(IdxBaseReg, Offset, Unused) 1723 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 1724 1725 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 1726 1727 // Skip out of bounds offsets, or else we would end up using an undefined 1728 // register. 1729 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 1730 return std::make_pair(IdxReg, SubRegs[0]); 1731 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 1732 } 1733 1734 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 1735 MachineInstr &MI) const { 1736 Register DstReg = MI.getOperand(0).getReg(); 1737 Register SrcReg = MI.getOperand(1).getReg(); 1738 Register IdxReg = MI.getOperand(2).getReg(); 1739 1740 LLT DstTy = MRI->getType(DstReg); 1741 LLT SrcTy = MRI->getType(SrcReg); 1742 1743 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1744 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1745 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 1746 1747 // The index must be scalar. If it wasn't RegBankSelect should have moved this 1748 // into a waterfall loop. 
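  // For example, a divergent index such as
  //   %idx:vgpr(s32) = COPY $vgpr0
  //   %elt:vgpr(s32) = G_EXTRACT_VECTOR_ELT %vec, %idx
  // is expected to have already been rewritten by RegBankSelect into a loop
  // over the unique lane values, so failing selection here is the safe
  // fallback.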
1749 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 1750 return false; 1751 1752 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 1753 *MRI); 1754 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 1755 *MRI); 1756 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1757 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 1758 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 1759 return false; 1760 1761 MachineBasicBlock *BB = MI.getParent(); 1762 const DebugLoc &DL = MI.getDebugLoc(); 1763 const bool Is64 = DstTy.getSizeInBits() == 64; 1764 1765 unsigned SubReg; 1766 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 1767 DstTy.getSizeInBits() / 8); 1768 1769 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 1770 if (DstTy.getSizeInBits() != 32 && !Is64) 1771 return false; 1772 1773 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1774 .addReg(IdxReg); 1775 1776 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 1777 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 1778 .addReg(SrcReg, 0, SubReg) 1779 .addReg(SrcReg, RegState::Implicit); 1780 MI.eraseFromParent(); 1781 return true; 1782 } 1783 1784 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 1785 return false; 1786 1787 if (!STI.useVGPRIndexMode()) { 1788 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1789 .addReg(IdxReg); 1790 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 1791 .addReg(SrcReg, RegState::Undef, SubReg) 1792 .addReg(SrcReg, RegState::Implicit); 1793 MI.eraseFromParent(); 1794 return true; 1795 } 1796 1797 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 1798 .addReg(IdxReg) 1799 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 1800 BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg) 1801 .addReg(SrcReg, RegState::Undef, SubReg) 1802 .addReg(SrcReg, RegState::Implicit) 1803 .addReg(AMDGPU::M0, RegState::Implicit); 1804 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 1805 1806 MI.eraseFromParent(); 1807 return true; 1808 } 1809 1810 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 1811 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 1812 MachineInstr &MI) const { 1813 Register DstReg = MI.getOperand(0).getReg(); 1814 Register VecReg = MI.getOperand(1).getReg(); 1815 Register ValReg = MI.getOperand(2).getReg(); 1816 Register IdxReg = MI.getOperand(3).getReg(); 1817 1818 LLT VecTy = MRI->getType(DstReg); 1819 LLT ValTy = MRI->getType(ValReg); 1820 unsigned VecSize = VecTy.getSizeInBits(); 1821 unsigned ValSize = ValTy.getSizeInBits(); 1822 1823 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 1824 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 1825 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 1826 1827 assert(VecTy.getElementType() == ValTy); 1828 1829 // The index must be scalar. If it wasn't RegBankSelect should have moved this 1830 // into a waterfall loop. 
1831 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 1832 return false; 1833 1834 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 1835 *MRI); 1836 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 1837 *MRI); 1838 1839 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 1840 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 1841 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 1842 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 1843 return false; 1844 1845 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 1846 return false; 1847 1848 unsigned SubReg; 1849 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 1850 ValSize / 8); 1851 1852 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 1853 STI.useVGPRIndexMode(); 1854 1855 MachineBasicBlock *BB = MI.getParent(); 1856 const DebugLoc &DL = MI.getDebugLoc(); 1857 1858 if (IndexMode) { 1859 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 1860 .addReg(IdxReg) 1861 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 1862 } else { 1863 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1864 .addReg(IdxReg); 1865 } 1866 1867 const MCInstrDesc &RegWriteOp 1868 = TII.getIndirectRegWritePseudo(VecSize, ValSize, 1869 VecRB->getID() == AMDGPU::SGPRRegBankID); 1870 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 1871 .addReg(VecReg) 1872 .addReg(ValReg) 1873 .addImm(SubReg); 1874 1875 if (IndexMode) 1876 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 1877 1878 MI.eraseFromParent(); 1879 return true; 1880 } 1881 1882 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 1883 if (I.isPHI()) 1884 return selectPHI(I); 1885 1886 if (!I.isPreISelOpcode()) { 1887 if (I.isCopy()) 1888 return selectCOPY(I); 1889 return true; 1890 } 1891 1892 switch (I.getOpcode()) { 1893 case TargetOpcode::G_AND: 1894 case TargetOpcode::G_OR: 1895 case TargetOpcode::G_XOR: 1896 if (selectG_AND_OR_XOR(I)) 1897 return true; 1898 return selectImpl(I, *CoverageInfo); 1899 case TargetOpcode::G_ADD: 1900 case TargetOpcode::G_SUB: 1901 if (selectImpl(I, *CoverageInfo)) 1902 return true; 1903 return selectG_ADD_SUB(I); 1904 case TargetOpcode::G_UADDO: 1905 case TargetOpcode::G_USUBO: 1906 case TargetOpcode::G_UADDE: 1907 case TargetOpcode::G_USUBE: 1908 return selectG_UADDO_USUBO_UADDE_USUBE(I); 1909 case TargetOpcode::G_INTTOPTR: 1910 case TargetOpcode::G_BITCAST: 1911 case TargetOpcode::G_PTRTOINT: 1912 return selectCOPY(I); 1913 case TargetOpcode::G_CONSTANT: 1914 case TargetOpcode::G_FCONSTANT: 1915 return selectG_CONSTANT(I); 1916 case TargetOpcode::G_FNEG: 1917 if (selectImpl(I, *CoverageInfo)) 1918 return true; 1919 return selectG_FNEG(I); 1920 case TargetOpcode::G_EXTRACT: 1921 return selectG_EXTRACT(I); 1922 case TargetOpcode::G_MERGE_VALUES: 1923 case TargetOpcode::G_BUILD_VECTOR: 1924 case TargetOpcode::G_CONCAT_VECTORS: 1925 return selectG_MERGE_VALUES(I); 1926 case TargetOpcode::G_UNMERGE_VALUES: 1927 return selectG_UNMERGE_VALUES(I); 1928 case TargetOpcode::G_PTR_ADD: 1929 return selectG_PTR_ADD(I); 1930 case TargetOpcode::G_IMPLICIT_DEF: 1931 return selectG_IMPLICIT_DEF(I); 1932 case TargetOpcode::G_INSERT: 1933 return selectG_INSERT(I); 1934 case TargetOpcode::G_INTRINSIC: 1935 return selectG_INTRINSIC(I); 1936 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 1937 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 1938 case TargetOpcode::G_ICMP: 1939 if (selectG_ICMP(I)) 1940 return true; 1941 return selectImpl(I, 
*CoverageInfo); 1942 case TargetOpcode::G_LOAD: 1943 case TargetOpcode::G_ATOMIC_CMPXCHG: 1944 case TargetOpcode::G_ATOMICRMW_XCHG: 1945 case TargetOpcode::G_ATOMICRMW_ADD: 1946 case TargetOpcode::G_ATOMICRMW_SUB: 1947 case TargetOpcode::G_ATOMICRMW_AND: 1948 case TargetOpcode::G_ATOMICRMW_OR: 1949 case TargetOpcode::G_ATOMICRMW_XOR: 1950 case TargetOpcode::G_ATOMICRMW_MIN: 1951 case TargetOpcode::G_ATOMICRMW_MAX: 1952 case TargetOpcode::G_ATOMICRMW_UMIN: 1953 case TargetOpcode::G_ATOMICRMW_UMAX: 1954 case TargetOpcode::G_ATOMICRMW_FADD: 1955 return selectG_LOAD_ATOMICRMW(I); 1956 case TargetOpcode::G_SELECT: 1957 return selectG_SELECT(I); 1958 case TargetOpcode::G_STORE: 1959 return selectG_STORE(I); 1960 case TargetOpcode::G_TRUNC: 1961 return selectG_TRUNC(I); 1962 case TargetOpcode::G_SEXT: 1963 case TargetOpcode::G_ZEXT: 1964 case TargetOpcode::G_ANYEXT: 1965 case TargetOpcode::G_SEXT_INREG: 1966 if (selectImpl(I, *CoverageInfo)) 1967 return true; 1968 return selectG_SZA_EXT(I); 1969 case TargetOpcode::G_BRCOND: 1970 return selectG_BRCOND(I); 1971 case TargetOpcode::G_FRAME_INDEX: 1972 case TargetOpcode::G_GLOBAL_VALUE: 1973 return selectG_FRAME_INDEX_GLOBAL_VALUE(I); 1974 case TargetOpcode::G_PTR_MASK: 1975 return selectG_PTR_MASK(I); 1976 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1977 return selectG_EXTRACT_VECTOR_ELT(I); 1978 case TargetOpcode::G_INSERT_VECTOR_ELT: 1979 return selectG_INSERT_VECTOR_ELT(I); 1980 case AMDGPU::G_AMDGPU_ATOMIC_INC: 1981 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 1982 initM0(I); 1983 return selectImpl(I, *CoverageInfo); 1984 default: 1985 return selectImpl(I, *CoverageInfo); 1986 } 1987 return false; 1988 } 1989 1990 InstructionSelector::ComplexRendererFns 1991 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 1992 return {{ 1993 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 1994 }}; 1995 1996 } 1997 1998 std::pair<Register, unsigned> 1999 AMDGPUInstructionSelector::selectVOP3ModsImpl( 2000 Register Src) const { 2001 unsigned Mods = 0; 2002 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 2003 2004 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 2005 Src = MI->getOperand(1).getReg(); 2006 Mods |= SISrcMods::NEG; 2007 MI = getDefIgnoringCopies(Src, *MRI); 2008 } 2009 2010 if (MI && MI->getOpcode() == AMDGPU::G_FABS) { 2011 Src = MI->getOperand(1).getReg(); 2012 Mods |= SISrcMods::ABS; 2013 } 2014 2015 return std::make_pair(Src, Mods); 2016 } 2017 2018 /// 2019 /// This will select either an SGPR or VGPR operand and will save us from 2020 /// having to write an extra tablegen pattern. 
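///
/// A rough sketch of how the generated selector consumes such a renderer
/// (the calling code below is illustrative, not the actual TableGen output):
///   if (auto Renderers = selectVSRC0(MI.getOperand(1)))
///     for (auto &Fn : *Renderers)
///       Fn(MIB); // Simply re-adds the original operand, whatever its bank.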
2021 InstructionSelector::ComplexRendererFns 2022 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 2023 return {{ 2024 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2025 }}; 2026 } 2027 2028 InstructionSelector::ComplexRendererFns 2029 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 2030 Register Src; 2031 unsigned Mods; 2032 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 2033 2034 return {{ 2035 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2036 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 2037 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2038 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2039 }}; 2040 } 2041 2042 InstructionSelector::ComplexRendererFns 2043 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 2044 return {{ 2045 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2046 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2047 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2048 }}; 2049 } 2050 2051 InstructionSelector::ComplexRendererFns 2052 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 2053 Register Src; 2054 unsigned Mods; 2055 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 2056 2057 return {{ 2058 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2059 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2060 }}; 2061 } 2062 2063 InstructionSelector::ComplexRendererFns 2064 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 2065 Register Reg = Root.getReg(); 2066 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 2067 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 2068 Def->getOpcode() == AMDGPU::G_FABS)) 2069 return {}; 2070 return {{ 2071 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2072 }}; 2073 } 2074 2075 InstructionSelector::ComplexRendererFns 2076 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 2077 Register Src; 2078 unsigned Mods; 2079 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 2080 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) 2081 return None; 2082 2083 return {{ 2084 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2085 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2086 }}; 2087 } 2088 2089 InstructionSelector::ComplexRendererFns 2090 AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const { 2091 // FIXME: Handle clamp and op_sel 2092 return {{ 2093 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 2094 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods 2095 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp 2096 }}; 2097 } 2098 2099 InstructionSelector::ComplexRendererFns 2100 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 2101 // FIXME: Handle op_sel 2102 return {{ 2103 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 2104 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 2105 }}; 2106 } 2107 2108 InstructionSelector::ComplexRendererFns 2109 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 2110 SmallVector<GEPInfo, 4> AddrInfo; 2111 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 2112 2113 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 2114 return None; 2115 2116 const GEPInfo &GEPInfo = AddrInfo[0]; 2117 Optional<int64_t> EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); 2118 if (!EncodedImm) 2119 
    return None;

  unsigned PtrReg = GEPInfo.SgprParts[0];
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  unsigned PtrReg = GEPInfo.SgprParts[0];
  Optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
  if (!EncodedImm)
    return None;

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits,
  // then we can select all ptr + 32-bit offsets, not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM patterns are considered before the _SGPR patterns.
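  // The net effect is to materialize the immediate into an SGPR and use the
  // _SGPR addressing form, roughly (illustrative, offset shown unencoded):
  //   $sN = S_MOV_B32 <imm>
  //   $dst = S_LOAD_DWORD_SGPR $sbase, $sN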
2170 unsigned PtrReg = GEPInfo.SgprParts[0]; 2171 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2172 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) 2173 .addImm(GEPInfo.Imm); 2174 return {{ 2175 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 2176 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } 2177 }}; 2178 } 2179 2180 template <bool Signed> 2181 InstructionSelector::ComplexRendererFns 2182 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { 2183 MachineInstr *MI = Root.getParent(); 2184 2185 InstructionSelector::ComplexRendererFns Default = {{ 2186 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 2187 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset 2188 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 2189 }}; 2190 2191 if (!STI.hasFlatInstOffsets()) 2192 return Default; 2193 2194 const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); 2195 if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD) 2196 return Default; 2197 2198 Optional<int64_t> Offset = 2199 getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); 2200 if (!Offset.hasValue()) 2201 return Default; 2202 2203 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 2204 if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed)) 2205 return Default; 2206 2207 Register BasePtr = OpDef->getOperand(1).getReg(); 2208 2209 return {{ 2210 [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, 2211 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, 2212 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 2213 }}; 2214 } 2215 2216 InstructionSelector::ComplexRendererFns 2217 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 2218 return selectFlatOffsetImpl<false>(Root); 2219 } 2220 2221 InstructionSelector::ComplexRendererFns 2222 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { 2223 return selectFlatOffsetImpl<true>(Root); 2224 } 2225 2226 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 2227 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 2228 return PSV && PSV->isStack(); 2229 } 2230 2231 InstructionSelector::ComplexRendererFns 2232 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 2233 MachineInstr *MI = Root.getParent(); 2234 MachineBasicBlock *MBB = MI->getParent(); 2235 MachineFunction *MF = MBB->getParent(); 2236 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 2237 2238 int64_t Offset = 0; 2239 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) { 2240 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2241 2242 // TODO: Should this be inside the render function? The iterator seems to 2243 // move. 2244 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 2245 HighBits) 2246 .addImm(Offset & ~4095); 2247 2248 return {{[=](MachineInstrBuilder &MIB) { // rsrc 2249 MIB.addReg(Info->getScratchRSrcReg()); 2250 }, 2251 [=](MachineInstrBuilder &MIB) { // vaddr 2252 MIB.addReg(HighBits); 2253 }, 2254 [=](MachineInstrBuilder &MIB) { // soffset 2255 const MachineMemOperand *MMO = *MI->memoperands_begin(); 2256 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 2257 2258 Register SOffsetReg = isStackPtrRelative(PtrInfo) 2259 ? 
Info->getStackPtrOffsetReg() 2260 : Info->getScratchWaveOffsetReg(); 2261 MIB.addReg(SOffsetReg); 2262 }, 2263 [=](MachineInstrBuilder &MIB) { // offset 2264 MIB.addImm(Offset & 4095); 2265 }}}; 2266 } 2267 2268 assert(Offset == 0); 2269 2270 // Try to fold a frame index directly into the MUBUF vaddr field, and any 2271 // offsets. 2272 Optional<int> FI; 2273 Register VAddr = Root.getReg(); 2274 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) { 2275 if (isBaseWithConstantOffset(Root, *MRI)) { 2276 const MachineOperand &LHS = RootDef->getOperand(1); 2277 const MachineOperand &RHS = RootDef->getOperand(2); 2278 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); 2279 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); 2280 if (LHSDef && RHSDef) { 2281 int64_t PossibleOffset = 2282 RHSDef->getOperand(1).getCImm()->getSExtValue(); 2283 if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && 2284 (!STI.privateMemoryResourceIsRangeChecked() || 2285 KnownBits->signBitIsZero(LHS.getReg()))) { 2286 if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) 2287 FI = LHSDef->getOperand(1).getIndex(); 2288 else 2289 VAddr = LHS.getReg(); 2290 Offset = PossibleOffset; 2291 } 2292 } 2293 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { 2294 FI = RootDef->getOperand(1).getIndex(); 2295 } 2296 } 2297 2298 // If we don't know this private access is a local stack object, it needs to 2299 // be relative to the entry point's scratch wave offset register. 2300 // TODO: Should split large offsets that don't fit like above. 2301 // TODO: Don't use scratch wave offset just because the offset didn't fit. 2302 Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg() 2303 : Info->getScratchWaveOffsetReg(); 2304 2305 return {{[=](MachineInstrBuilder &MIB) { // rsrc 2306 MIB.addReg(Info->getScratchRSrcReg()); 2307 }, 2308 [=](MachineInstrBuilder &MIB) { // vaddr 2309 if (FI.hasValue()) 2310 MIB.addFrameIndex(FI.getValue()); 2311 else 2312 MIB.addReg(VAddr); 2313 }, 2314 [=](MachineInstrBuilder &MIB) { // soffset 2315 MIB.addReg(SOffset); 2316 }, 2317 [=](MachineInstrBuilder &MIB) { // offset 2318 MIB.addImm(Offset); 2319 }}}; 2320 } 2321 2322 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, 2323 int64_t Offset, 2324 unsigned OffsetBits) const { 2325 if ((OffsetBits == 16 && !isUInt<16>(Offset)) || 2326 (OffsetBits == 8 && !isUInt<8>(Offset))) 2327 return false; 2328 2329 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 2330 return true; 2331 2332 // On Southern Islands instruction with a negative base value and an offset 2333 // don't seem to work. 2334 return KnownBits->signBitIsZero(Base); 2335 } 2336 2337 InstructionSelector::ComplexRendererFns 2338 AMDGPUInstructionSelector::selectMUBUFScratchOffset( 2339 MachineOperand &Root) const { 2340 MachineInstr *MI = Root.getParent(); 2341 MachineBasicBlock *MBB = MI->getParent(); 2342 2343 int64_t Offset = 0; 2344 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || 2345 !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) 2346 return {}; 2347 2348 const MachineFunction *MF = MBB->getParent(); 2349 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 2350 const MachineMemOperand *MMO = *MI->memoperands_begin(); 2351 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 2352 2353 Register SOffsetReg = isStackPtrRelative(PtrInfo) 2354 ? 
Info->getStackPtrOffsetReg() 2355 : Info->getScratchWaveOffsetReg(); 2356 return {{ 2357 [=](MachineInstrBuilder &MIB) { 2358 MIB.addReg(Info->getScratchRSrcReg()); 2359 }, // rsrc 2360 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset 2361 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 2362 }}; 2363 } 2364 2365 std::pair<Register, unsigned> 2366 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 2367 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 2368 if (!RootDef) 2369 return std::make_pair(Root.getReg(), 0); 2370 2371 int64_t ConstAddr = 0; 2372 if (isBaseWithConstantOffset(Root, *MRI)) { 2373 const MachineOperand &LHS = RootDef->getOperand(1); 2374 const MachineOperand &RHS = RootDef->getOperand(2); 2375 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); 2376 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); 2377 if (LHSDef && RHSDef) { 2378 int64_t PossibleOffset = 2379 RHSDef->getOperand(1).getCImm()->getSExtValue(); 2380 if (isDSOffsetLegal(LHS.getReg(), PossibleOffset, 16)) { 2381 // (add n0, c0) 2382 return std::make_pair(LHS.getReg(), PossibleOffset); 2383 } 2384 } 2385 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 2386 // TODO 2387 2388 2389 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 2390 // TODO 2391 2392 } 2393 2394 return std::make_pair(Root.getReg(), 0); 2395 } 2396 2397 InstructionSelector::ComplexRendererFns 2398 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { 2399 2400 Register Reg; 2401 unsigned Offset; 2402 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); 2403 return {{ 2404 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2405 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } 2406 }}; 2407 } 2408 2409 InstructionSelector::ComplexRendererFns 2410 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { 2411 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 2412 if (!RootDef) { 2413 return {{ 2414 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2415 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 2416 [=](MachineInstrBuilder &MIB) { MIB.addImm(1); } 2417 }}; 2418 } 2419 2420 int64_t ConstAddr = 0; 2421 Register PtrBase; 2422 int64_t Offset; 2423 2424 std::tie(PtrBase, Offset) = 2425 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 2426 2427 if (Offset) { 2428 int64_t DWordOffset0 = Offset / 4; 2429 int64_t DWordOffset1 = DWordOffset0 + 1; 2430 if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) { 2431 // (add n0, c0) 2432 return {{ 2433 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, 2434 [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset0); }, 2435 [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset1); } 2436 }}; 2437 } 2438 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 2439 // TODO 2440 2441 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 2442 // TODO 2443 2444 } 2445 2446 return {{ 2447 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2448 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 2449 [=](MachineInstrBuilder &MIB) { MIB.addImm(1); } 2450 }}; 2451 } 2452 2453 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return 2454 /// the base value with the constant offset. There may be intervening copies 2455 /// between \p Root and the identified constant. Returns \p Root, 0 if this does 2456 /// not match the pattern. 
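///
/// For example, given
///   %c:_(s64) = G_CONSTANT i64 16
///   %p:_(p1) = G_PTR_ADD %base, %c
/// a query on \p Root = %p returns {%base, 16}.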
2457 std::pair<Register, int64_t> 2458 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( 2459 Register Root, const MachineRegisterInfo &MRI) const { 2460 MachineInstr *RootI = MRI.getVRegDef(Root); 2461 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) 2462 return {Root, 0}; 2463 2464 MachineOperand &RHS = RootI->getOperand(2); 2465 Optional<ValueAndVReg> MaybeOffset 2466 = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true); 2467 if (!MaybeOffset) 2468 return {Root, 0}; 2469 return {RootI->getOperand(1).getReg(), MaybeOffset->Value}; 2470 } 2471 2472 static void addZeroImm(MachineInstrBuilder &MIB) { 2473 MIB.addImm(0); 2474 } 2475 2476 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p 2477 /// BasePtr is not valid, a null base pointer will be used. 2478 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, 2479 uint32_t FormatLo, uint32_t FormatHi, 2480 Register BasePtr) { 2481 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 2482 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 2483 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2484 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 2485 2486 B.buildInstr(AMDGPU::S_MOV_B32) 2487 .addDef(RSrc2) 2488 .addImm(FormatLo); 2489 B.buildInstr(AMDGPU::S_MOV_B32) 2490 .addDef(RSrc3) 2491 .addImm(FormatHi); 2492 2493 // Build the half of the subregister with the constants before building the 2494 // full 128-bit register. If we are building multiple resource descriptors, 2495 // this will allow CSEing of the 2-component register. 2496 B.buildInstr(AMDGPU::REG_SEQUENCE) 2497 .addDef(RSrcHi) 2498 .addReg(RSrc2) 2499 .addImm(AMDGPU::sub0) 2500 .addReg(RSrc3) 2501 .addImm(AMDGPU::sub1); 2502 2503 Register RSrcLo = BasePtr; 2504 if (!BasePtr) { 2505 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2506 B.buildInstr(AMDGPU::S_MOV_B64) 2507 .addDef(RSrcLo) 2508 .addImm(0); 2509 } 2510 2511 B.buildInstr(AMDGPU::REG_SEQUENCE) 2512 .addDef(RSrc) 2513 .addReg(RSrcLo) 2514 .addImm(AMDGPU::sub0_sub1) 2515 .addReg(RSrcHi) 2516 .addImm(AMDGPU::sub2_sub3); 2517 2518 return RSrc; 2519 } 2520 2521 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 2522 const SIInstrInfo &TII, Register BasePtr) { 2523 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 2524 2525 // FIXME: Why are half the "default" bits ignored based on the addressing 2526 // mode? 2527 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr); 2528 } 2529 2530 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 2531 const SIInstrInfo &TII, Register BasePtr) { 2532 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 2533 2534 // FIXME: Why are half the "default" bits ignored based on the addressing 2535 // mode? 
2536 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr); 2537 } 2538 2539 AMDGPUInstructionSelector::MUBUFAddressData 2540 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { 2541 MUBUFAddressData Data; 2542 Data.N0 = Src; 2543 2544 Register PtrBase; 2545 int64_t Offset; 2546 2547 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); 2548 if (isUInt<32>(Offset)) { 2549 Data.N0 = PtrBase; 2550 Data.Offset = Offset; 2551 } 2552 2553 if (MachineInstr *InputAdd 2554 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) { 2555 Data.N2 = InputAdd->getOperand(1).getReg(); 2556 Data.N3 = InputAdd->getOperand(2).getReg(); 2557 2558 // FIXME: Need to fix extra SGPR->VGPRcopies inserted 2559 // FIXME: Don't know this was defined by operand 0 2560 // 2561 // TODO: Remove this when we have copy folding optimizations after 2562 // RegBankSelect. 2563 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg(); 2564 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg(); 2565 } 2566 2567 return Data; 2568 } 2569 2570 /// Return if the addr64 mubuf mode should be used for the given address. 2571 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const { 2572 // (ptr_add N2, N3) -> addr64, or 2573 // (ptr_add (ptr_add N2, N3), C1) -> addr64 2574 if (Addr.N2) 2575 return true; 2576 2577 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI); 2578 return N0Bank->getID() == AMDGPU::VGPRRegBankID; 2579 } 2580 2581 /// Split an immediate offset \p ImmOffset depending on whether it fits in the 2582 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable 2583 /// component. 2584 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset( 2585 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const { 2586 if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset)) 2587 return; 2588 2589 // Illegal offset, store it in soffset. 2590 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2591 B.buildInstr(AMDGPU::S_MOV_B32) 2592 .addDef(SOffset) 2593 .addImm(ImmOffset); 2594 ImmOffset = 0; 2595 } 2596 2597 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl( 2598 MachineOperand &Root, Register &VAddr, Register &RSrcReg, 2599 Register &SOffset, int64_t &Offset) const { 2600 // FIXME: Predicates should stop this from reaching here. 2601 // addr64 bit was removed for volcanic islands. 2602 if (!STI.hasAddr64() || STI.useFlatForGlobal()) 2603 return false; 2604 2605 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 2606 if (!shouldUseAddr64(AddrData)) 2607 return false; 2608 2609 Register N0 = AddrData.N0; 2610 Register N2 = AddrData.N2; 2611 Register N3 = AddrData.N3; 2612 Offset = AddrData.Offset; 2613 2614 // Base pointer for the SRD. 2615 Register SRDPtr; 2616 2617 if (N2) { 2618 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 2619 assert(N3); 2620 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 2621 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the 2622 // addr64, and construct the default resource from a 0 address. 2623 VAddr = N0; 2624 } else { 2625 SRDPtr = N3; 2626 VAddr = N2; 2627 } 2628 } else { 2629 // N2 is not divergent. 
2630 SRDPtr = N2; 2631 VAddr = N3; 2632 } 2633 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 2634 // Use the default null pointer in the resource 2635 VAddr = N0; 2636 } else { 2637 // N0 -> offset, or 2638 // (N0 + C1) -> offset 2639 SRDPtr = N0; 2640 } 2641 2642 MachineIRBuilder B(*Root.getParent()); 2643 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr); 2644 splitIllegalMUBUFOffset(B, SOffset, Offset); 2645 return true; 2646 } 2647 2648 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( 2649 MachineOperand &Root, Register &RSrcReg, Register &SOffset, 2650 int64_t &Offset) const { 2651 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 2652 if (shouldUseAddr64(AddrData)) 2653 return false; 2654 2655 // N0 -> offset, or 2656 // (N0 + C1) -> offset 2657 Register SRDPtr = AddrData.N0; 2658 Offset = AddrData.Offset; 2659 2660 // TODO: Look through extensions for 32-bit soffset. 2661 MachineIRBuilder B(*Root.getParent()); 2662 2663 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); 2664 splitIllegalMUBUFOffset(B, SOffset, Offset); 2665 return true; 2666 } 2667 2668 InstructionSelector::ComplexRendererFns 2669 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { 2670 Register VAddr; 2671 Register RSrcReg; 2672 Register SOffset; 2673 int64_t Offset = 0; 2674 2675 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 2676 return {}; 2677 2678 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 2679 // pattern. 2680 return {{ 2681 [=](MachineInstrBuilder &MIB) { // rsrc 2682 MIB.addReg(RSrcReg); 2683 }, 2684 [=](MachineInstrBuilder &MIB) { // vaddr 2685 MIB.addReg(VAddr); 2686 }, 2687 [=](MachineInstrBuilder &MIB) { // soffset 2688 if (SOffset) 2689 MIB.addReg(SOffset); 2690 else 2691 MIB.addImm(0); 2692 }, 2693 [=](MachineInstrBuilder &MIB) { // offset 2694 MIB.addImm(Offset); 2695 }, 2696 addZeroImm, // glc 2697 addZeroImm, // slc 2698 addZeroImm, // tfe 2699 addZeroImm, // dlc 2700 addZeroImm // swz 2701 }}; 2702 } 2703 2704 InstructionSelector::ComplexRendererFns 2705 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { 2706 Register RSrcReg; 2707 Register SOffset; 2708 int64_t Offset = 0; 2709 2710 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 2711 return {}; 2712 2713 return {{ 2714 [=](MachineInstrBuilder &MIB) { // rsrc 2715 MIB.addReg(RSrcReg); 2716 }, 2717 [=](MachineInstrBuilder &MIB) { // soffset 2718 if (SOffset) 2719 MIB.addReg(SOffset); 2720 else 2721 MIB.addImm(0); 2722 }, 2723 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 2724 addZeroImm, // glc 2725 addZeroImm, // slc 2726 addZeroImm, // tfe 2727 addZeroImm, // dlc 2728 addZeroImm // swz 2729 }}; 2730 } 2731 2732 InstructionSelector::ComplexRendererFns 2733 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const { 2734 Register VAddr; 2735 Register RSrcReg; 2736 Register SOffset; 2737 int64_t Offset = 0; 2738 2739 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 2740 return {}; 2741 2742 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 2743 // pattern. 
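  // Note that, unlike the non-atomic variants above, only an slc immediate is
  // appended; for buffer atomics the glc (return) behavior is presumably tied
  // to which opcode variant gets selected rather than being a pattern operand.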
2744 return {{ 2745 [=](MachineInstrBuilder &MIB) { // rsrc 2746 MIB.addReg(RSrcReg); 2747 }, 2748 [=](MachineInstrBuilder &MIB) { // vaddr 2749 MIB.addReg(VAddr); 2750 }, 2751 [=](MachineInstrBuilder &MIB) { // soffset 2752 if (SOffset) 2753 MIB.addReg(SOffset); 2754 else 2755 MIB.addImm(0); 2756 }, 2757 [=](MachineInstrBuilder &MIB) { // offset 2758 MIB.addImm(Offset); 2759 }, 2760 addZeroImm // slc 2761 }}; 2762 } 2763 2764 InstructionSelector::ComplexRendererFns 2765 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const { 2766 Register RSrcReg; 2767 Register SOffset; 2768 int64_t Offset = 0; 2769 2770 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 2771 return {}; 2772 2773 return {{ 2774 [=](MachineInstrBuilder &MIB) { // rsrc 2775 MIB.addReg(RSrcReg); 2776 }, 2777 [=](MachineInstrBuilder &MIB) { // soffset 2778 if (SOffset) 2779 MIB.addReg(SOffset); 2780 else 2781 MIB.addImm(0); 2782 }, 2783 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 2784 addZeroImm // slc 2785 }}; 2786 } 2787 2788 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, 2789 const MachineInstr &MI, 2790 int OpIdx) const { 2791 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 2792 "Expected G_CONSTANT"); 2793 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue()); 2794 } 2795 2796 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, 2797 const MachineInstr &MI, 2798 int OpIdx) const { 2799 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 2800 "Expected G_CONSTANT"); 2801 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue()); 2802 } 2803 2804 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB, 2805 const MachineInstr &MI, 2806 int OpIdx) const { 2807 assert(OpIdx == -1); 2808 2809 const MachineOperand &Op = MI.getOperand(1); 2810 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) 2811 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 2812 else { 2813 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); 2814 MIB.addImm(Op.getCImm()->getSExtValue()); 2815 } 2816 } 2817 2818 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, 2819 const MachineInstr &MI, 2820 int OpIdx) const { 2821 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 2822 "Expected G_CONSTANT"); 2823 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); 2824 } 2825 2826 /// This only really exists to satisfy DAG type checking machinery, so is a 2827 /// no-op here. 
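///
/// On the GlobalISel side the matched operand is already a plain immediate,
/// so it is simply forwarded unchanged via MIB.addImm().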
2828 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, 2829 const MachineInstr &MI, 2830 int OpIdx) const { 2831 MIB.addImm(MI.getOperand(OpIdx).getImm()); 2832 } 2833 2834 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB, 2835 const MachineInstr &MI, 2836 int OpIdx) const { 2837 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2838 MIB.addImm(MI.getOperand(OpIdx).getImm() & 1); 2839 } 2840 2841 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB, 2842 const MachineInstr &MI, 2843 int OpIdx) const { 2844 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2845 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1); 2846 } 2847 2848 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB, 2849 const MachineInstr &MI, 2850 int OpIdx) const { 2851 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2852 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1); 2853 } 2854 2855 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, 2856 const MachineInstr &MI, 2857 int OpIdx) const { 2858 assert(OpIdx >= 0 && "expected to match an immediate operand"); 2859 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); 2860 } 2861 2862 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { 2863 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); 2864 } 2865 2866 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const { 2867 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm()); 2868 } 2869 2870 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const { 2871 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm()); 2872 } 2873 2874 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { 2875 return TII.isInlineConstant(Imm); 2876 } 2877
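// Note: the set of inline immediates is subtarget dependent; in particular the
// 1/(2*pi) constant is only treated as inline when the subtarget reports
// hasInv2PiInlineImm(), which is why it is threaded through the
// isInlinableLiteral* helpers above.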