1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUInstructionSelector.h" 15 #include "AMDGPUInstrInfo.h" 16 #include "AMDGPUGlobalISelUtils.h" 17 #include "AMDGPURegisterBankInfo.h" 18 #include "AMDGPUSubtarget.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 21 #include "SIMachineFunctionInfo.h" 22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" 25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 26 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 27 #include "llvm/CodeGen/GlobalISel/Utils.h" 28 #include "llvm/CodeGen/MachineBasicBlock.h" 29 #include "llvm/CodeGen/MachineFunction.h" 30 #include "llvm/CodeGen/MachineInstr.h" 31 #include "llvm/CodeGen/MachineInstrBuilder.h" 32 #include "llvm/CodeGen/MachineRegisterInfo.h" 33 #include "llvm/IR/Type.h" 34 #include "llvm/Support/Debug.h" 35 #include "llvm/Support/raw_ostream.h" 36 37 #define DEBUG_TYPE "amdgpu-isel" 38 39 using namespace llvm; 40 using namespace MIPatternMatch; 41 42 #define GET_GLOBALISEL_IMPL 43 #define AMDGPUSubtarget GCNSubtarget 44 #include "AMDGPUGenGlobalISel.inc" 45 #undef GET_GLOBALISEL_IMPL 46 #undef AMDGPUSubtarget 47 48 AMDGPUInstructionSelector::AMDGPUInstructionSelector( 49 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, 50 const AMDGPUTargetMachine &TM) 51 : InstructionSelector(), TII(*STI.getInstrInfo()), 52 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), 53 STI(STI), 54 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), 55 #define GET_GLOBALISEL_PREDICATES_INIT 56 #include "AMDGPUGenGlobalISel.inc" 57 #undef GET_GLOBALISEL_PREDICATES_INIT 58 #define GET_GLOBALISEL_TEMPORARIES_INIT 59 #include "AMDGPUGenGlobalISel.inc" 60 #undef GET_GLOBALISEL_TEMPORARIES_INIT 61 { 62 } 63 64 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } 65 66 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, 67 CodeGenCoverage &CoverageInfo) { 68 MRI = &MF.getRegInfo(); 69 InstructionSelector::setupMF(MF, KB, CoverageInfo); 70 } 71 72 bool AMDGPUInstructionSelector::isVCC(Register Reg, 73 const MachineRegisterInfo &MRI) const { 74 if (Register::isPhysicalRegister(Reg)) 75 return Reg == TRI.getVCC(); 76 77 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 78 const TargetRegisterClass *RC = 79 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 80 if (RC) { 81 const LLT Ty = MRI.getType(Reg); 82 return RC->hasSuperClassEq(TRI.getBoolRC()) && 83 Ty.isValid() && Ty.getSizeInBits() == 1; 84 } 85 86 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 87 return RB->getID() == AMDGPU::VCCRegBankID; 88 } 89 90 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, 91 unsigned NewOpc) const { 92 MI.setDesc(TII.get(NewOpc)); 93 MI.RemoveOperand(1); // Remove intrinsic 
ID. 94 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 95 96 MachineOperand &Dst = MI.getOperand(0); 97 MachineOperand &Src = MI.getOperand(1); 98 99 // TODO: This should be legalized to s32 if needed 100 if (MRI->getType(Dst.getReg()) == LLT::scalar(1)) 101 return false; 102 103 const TargetRegisterClass *DstRC 104 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 105 const TargetRegisterClass *SrcRC 106 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 107 if (!DstRC || DstRC != SrcRC) 108 return false; 109 110 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && 111 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); 112 } 113 114 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { 115 const DebugLoc &DL = I.getDebugLoc(); 116 MachineBasicBlock *BB = I.getParent(); 117 I.setDesc(TII.get(TargetOpcode::COPY)); 118 119 const MachineOperand &Src = I.getOperand(1); 120 MachineOperand &Dst = I.getOperand(0); 121 Register DstReg = Dst.getReg(); 122 Register SrcReg = Src.getReg(); 123 124 if (isVCC(DstReg, *MRI)) { 125 if (SrcReg == AMDGPU::SCC) { 126 const TargetRegisterClass *RC 127 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 128 if (!RC) 129 return true; 130 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 131 } 132 133 if (!isVCC(SrcReg, *MRI)) { 134 // TODO: Should probably leave the copy and let copyPhysReg expand it. 135 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI)) 136 return false; 137 138 const TargetRegisterClass *SrcRC 139 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 140 141 Register MaskedReg = MRI->createVirtualRegister(SrcRC); 142 143 // We can't trust the high bits at this point, so clear them. 144 145 // TODO: Skip masking high bits if def is known boolean. 146 147 unsigned AndOpc = TRI.isSGPRClass(SrcRC) ? 148 AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32; 149 BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg) 150 .addImm(1) 151 .addReg(SrcReg); 152 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) 153 .addImm(0) 154 .addReg(MaskedReg); 155 156 if (!MRI->getRegClassOrNull(SrcReg)) 157 MRI->setRegClass(SrcReg, SrcRC); 158 I.eraseFromParent(); 159 return true; 160 } 161 162 const TargetRegisterClass *RC = 163 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 164 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) 165 return false; 166 167 // Don't constrain the source register to a class so the def instruction 168 // handles it (unless it's undef). 169 // 170 // FIXME: This is a hack. When selecting the def, we neeed to know 171 // specifically know that the result is VCCRegBank, and not just an SGPR 172 // with size 1. An SReg_32 with size 1 is ambiguous with wave32. 173 if (Src.isUndef()) { 174 const TargetRegisterClass *SrcRC = 175 TRI.getConstrainedRegClassForOperand(Src, *MRI); 176 if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 177 return false; 178 } 179 180 return true; 181 } 182 183 for (const MachineOperand &MO : I.operands()) { 184 if (Register::isPhysicalRegister(MO.getReg())) 185 continue; 186 187 const TargetRegisterClass *RC = 188 TRI.getConstrainedRegClassForOperand(MO, *MRI); 189 if (!RC) 190 continue; 191 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 192 } 193 return true; 194 } 195 196 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { 197 const Register DefReg = I.getOperand(0).getReg(); 198 const LLT DefTy = MRI->getType(DefReg); 199 200 // TODO: Verify this doesn't have insane operands (i.e. 
VGPR to SGPR copy) 201 202 const RegClassOrRegBank &RegClassOrBank = 203 MRI->getRegClassOrRegBank(DefReg); 204 205 const TargetRegisterClass *DefRC 206 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 207 if (!DefRC) { 208 if (!DefTy.isValid()) { 209 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 210 return false; 211 } 212 213 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 214 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); 215 if (!DefRC) { 216 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 217 return false; 218 } 219 } 220 221 // TODO: Verify that all registers have the same bank 222 I.setDesc(TII.get(TargetOpcode::PHI)); 223 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); 224 } 225 226 MachineOperand 227 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, 228 const TargetRegisterClass &SubRC, 229 unsigned SubIdx) const { 230 231 MachineInstr *MI = MO.getParent(); 232 MachineBasicBlock *BB = MO.getParent()->getParent(); 233 Register DstReg = MRI->createVirtualRegister(&SubRC); 234 235 if (MO.isReg()) { 236 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); 237 Register Reg = MO.getReg(); 238 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 239 .addReg(Reg, 0, ComposedSubIdx); 240 241 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), 242 MO.isKill(), MO.isDead(), MO.isUndef(), 243 MO.isEarlyClobber(), 0, MO.isDebug(), 244 MO.isInternalRead()); 245 } 246 247 assert(MO.isImm()); 248 249 APInt Imm(64, MO.getImm()); 250 251 switch (SubIdx) { 252 default: 253 llvm_unreachable("do not know to split immediate with this sub index."); 254 case AMDGPU::sub0: 255 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); 256 case AMDGPU::sub1: 257 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); 258 } 259 } 260 261 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { 262 switch (Opc) { 263 case AMDGPU::G_AND: 264 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 265 case AMDGPU::G_OR: 266 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; 267 case AMDGPU::G_XOR: 268 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 269 default: 270 llvm_unreachable("not a bit op"); 271 } 272 } 273 274 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 275 MachineOperand &Dst = I.getOperand(0); 276 MachineOperand &Src0 = I.getOperand(1); 277 MachineOperand &Src1 = I.getOperand(2); 278 Register DstReg = Dst.getReg(); 279 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 280 281 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 282 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 283 const TargetRegisterClass *RC = TRI.getBoolRC(); 284 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), 285 RC == &AMDGPU::SReg_64RegClass); 286 I.setDesc(TII.get(InstOpc)); 287 // Dead implicit-def of scc 288 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 289 true, // isImp 290 false, // isKill 291 true)); // isDead 292 293 // FIXME: Hack to avoid turning the register bank into a register class. 294 // The selector for G_ICMP relies on seeing the register bank for the result 295 // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will 296 // be ambiguous whether it's a scalar or vector bool. 
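    // Undef sources will not get a class from selecting a def, so give them
    // the bool class here if they do not have one yet.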
297 if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) 298 MRI->setRegClass(Src0.getReg(), RC); 299 if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) 300 MRI->setRegClass(Src1.getReg(), RC); 301 302 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 303 } 304 305 // TODO: Should this allow an SCC bank result, and produce a copy from SCC for 306 // the result? 307 if (DstRB->getID() == AMDGPU::SGPRRegBankID) { 308 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); 309 I.setDesc(TII.get(InstOpc)); 310 // Dead implicit-def of scc 311 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 312 true, // isImp 313 false, // isKill 314 true)); // isDead 315 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 316 } 317 318 return false; 319 } 320 321 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 322 MachineBasicBlock *BB = I.getParent(); 323 MachineFunction *MF = BB->getParent(); 324 Register DstReg = I.getOperand(0).getReg(); 325 const DebugLoc &DL = I.getDebugLoc(); 326 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 327 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 328 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 329 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 330 331 if (Size == 32) { 332 if (IsSALU) { 333 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 334 MachineInstr *Add = 335 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 336 .add(I.getOperand(1)) 337 .add(I.getOperand(2)); 338 I.eraseFromParent(); 339 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 340 } 341 342 if (STI.hasAddNoCarry()) { 343 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 344 I.setDesc(TII.get(Opc)); 345 I.addOperand(*MF, MachineOperand::CreateImm(0)); 346 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 347 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 348 } 349 350 const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; 351 352 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); 353 MachineInstr *Add 354 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 355 .addDef(UnusedCarry, RegState::Dead) 356 .add(I.getOperand(1)) 357 .add(I.getOperand(2)) 358 .addImm(0); 359 I.eraseFromParent(); 360 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 361 } 362 363 assert(!Sub && "illegal sub should not reach here"); 364 365 const TargetRegisterClass &RC 366 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; 367 const TargetRegisterClass &HalfRC 368 = IsSALU ? 
AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass; 369 370 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0)); 371 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0)); 372 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); 373 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); 374 375 Register DstLo = MRI->createVirtualRegister(&HalfRC); 376 Register DstHi = MRI->createVirtualRegister(&HalfRC); 377 378 if (IsSALU) { 379 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) 380 .add(Lo1) 381 .add(Lo2); 382 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) 383 .add(Hi1) 384 .add(Hi2); 385 } else { 386 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); 387 Register CarryReg = MRI->createVirtualRegister(CarryRC); 388 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo) 389 .addDef(CarryReg) 390 .add(Lo1) 391 .add(Lo2) 392 .addImm(0); 393 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) 394 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) 395 .add(Hi1) 396 .add(Hi2) 397 .addReg(CarryReg, RegState::Kill) 398 .addImm(0); 399 400 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) 401 return false; 402 } 403 404 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 405 .addReg(DstLo) 406 .addImm(AMDGPU::sub0) 407 .addReg(DstHi) 408 .addImm(AMDGPU::sub1); 409 410 411 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 412 return false; 413 414 I.eraseFromParent(); 415 return true; 416 } 417 418 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( 419 MachineInstr &I) const { 420 MachineBasicBlock *BB = I.getParent(); 421 MachineFunction *MF = BB->getParent(); 422 const DebugLoc &DL = I.getDebugLoc(); 423 Register Dst0Reg = I.getOperand(0).getReg(); 424 Register Dst1Reg = I.getOperand(1).getReg(); 425 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO || 426 I.getOpcode() == AMDGPU::G_UADDE; 427 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE || 428 I.getOpcode() == AMDGPU::G_USUBE; 429 430 if (isVCC(Dst1Reg, *MRI)) { 431 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned 432 // carry out despite the _i32 name. These were renamed in VI to _U32. 433 // FIXME: We should probably rename the opcodes here. 434 unsigned NoCarryOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; 435 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 436 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc)); 437 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 438 I.addOperand(*MF, MachineOperand::CreateImm(0)); 439 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 440 } 441 442 Register Src0Reg = I.getOperand(2).getReg(); 443 Register Src1Reg = I.getOperand(3).getReg(); 444 445 if (HasCarryIn) { 446 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 447 .addReg(I.getOperand(4).getReg()); 448 } 449 450 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 451 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 452 453 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? 
CarryOpc : NoCarryOpc), Dst0Reg) 454 .add(I.getOperand(2)) 455 .add(I.getOperand(3)); 456 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 457 .addReg(AMDGPU::SCC); 458 459 if (!MRI->getRegClassOrNull(Dst1Reg)) 460 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 461 462 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 463 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 464 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 465 return false; 466 467 if (HasCarryIn && 468 !RBI.constrainGenericRegister(I.getOperand(4).getReg(), 469 AMDGPU::SReg_32RegClass, *MRI)) 470 return false; 471 472 I.eraseFromParent(); 473 return true; 474 } 475 476 // TODO: We should probably legalize these to only using 32-bit results. 477 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 478 MachineBasicBlock *BB = I.getParent(); 479 Register DstReg = I.getOperand(0).getReg(); 480 Register SrcReg = I.getOperand(1).getReg(); 481 LLT DstTy = MRI->getType(DstReg); 482 LLT SrcTy = MRI->getType(SrcReg); 483 const unsigned SrcSize = SrcTy.getSizeInBits(); 484 const unsigned DstSize = DstTy.getSizeInBits(); 485 486 // TODO: Should handle any multiple of 32 offset. 487 unsigned Offset = I.getOperand(2).getImm(); 488 if (Offset % 32 != 0 || DstSize > 128) 489 return false; 490 491 const TargetRegisterClass *DstRC = 492 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI); 493 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 494 return false; 495 496 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 497 const TargetRegisterClass *SrcRC = 498 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 499 if (!SrcRC) 500 return false; 501 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32, 502 DstSize / 32); 503 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg); 504 if (!SrcRC) 505 return false; 506 507 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I, 508 *SrcRC, I.getOperand(1)); 509 const DebugLoc &DL = I.getDebugLoc(); 510 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) 511 .addReg(SrcReg, 0, SubReg); 512 513 I.eraseFromParent(); 514 return true; 515 } 516 517 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 518 MachineBasicBlock *BB = MI.getParent(); 519 Register DstReg = MI.getOperand(0).getReg(); 520 LLT DstTy = MRI->getType(DstReg); 521 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 522 523 const unsigned SrcSize = SrcTy.getSizeInBits(); 524 if (SrcSize < 32) 525 return selectImpl(MI, *CoverageInfo); 526 527 const DebugLoc &DL = MI.getDebugLoc(); 528 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 529 const unsigned DstSize = DstTy.getSizeInBits(); 530 const TargetRegisterClass *DstRC = 531 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 532 if (!DstRC) 533 return false; 534 535 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 536 MachineInstrBuilder MIB = 537 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 538 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 539 MachineOperand &Src = MI.getOperand(I + 1); 540 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 541 MIB.addImm(SubRegs[I]); 542 543 const TargetRegisterClass *SrcRC 544 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 545 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 546 return false; 547 } 548 549 if 
(!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 550 return false; 551 552 MI.eraseFromParent(); 553 return true; 554 } 555 556 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 557 MachineBasicBlock *BB = MI.getParent(); 558 const int NumDst = MI.getNumOperands() - 1; 559 560 MachineOperand &Src = MI.getOperand(NumDst); 561 562 Register SrcReg = Src.getReg(); 563 Register DstReg0 = MI.getOperand(0).getReg(); 564 LLT DstTy = MRI->getType(DstReg0); 565 LLT SrcTy = MRI->getType(SrcReg); 566 567 const unsigned DstSize = DstTy.getSizeInBits(); 568 const unsigned SrcSize = SrcTy.getSizeInBits(); 569 const DebugLoc &DL = MI.getDebugLoc(); 570 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 571 572 const TargetRegisterClass *SrcRC = 573 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 574 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 575 return false; 576 577 const unsigned SrcFlags = getUndefRegState(Src.isUndef()); 578 579 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 580 // source, and this relies on the fact that the same subregister indices are 581 // used for both. 582 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 583 for (int I = 0, E = NumDst; I != E; ++I) { 584 MachineOperand &Dst = MI.getOperand(I); 585 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 586 .addReg(SrcReg, SrcFlags, SubRegs[I]); 587 588 const TargetRegisterClass *DstRC = 589 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 590 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 591 return false; 592 } 593 594 MI.eraseFromParent(); 595 return true; 596 } 597 598 static bool isZero(Register Reg, const MachineRegisterInfo &MRI) { 599 int64_t Val; 600 return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0; 601 } 602 603 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( 604 MachineInstr &MI) const { 605 if (selectImpl(MI, *CoverageInfo)) 606 return true; 607 608 const LLT S32 = LLT::scalar(32); 609 const LLT V2S16 = LLT::vector(2, 16); 610 611 Register Dst = MI.getOperand(0).getReg(); 612 if (MRI->getType(Dst) != V2S16) 613 return false; 614 615 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI); 616 if (DstBank->getID() != AMDGPU::SGPRRegBankID) 617 return false; 618 619 Register Src0 = MI.getOperand(1).getReg(); 620 Register Src1 = MI.getOperand(2).getReg(); 621 if (MRI->getType(Src0) != S32) 622 return false; 623 624 const DebugLoc &DL = MI.getDebugLoc(); 625 MachineBasicBlock *BB = MI.getParent(); 626 627 // TODO: This should probably be a combine somewhere 628 // (build_vector_trunc $src0, undef -> copy $src0 629 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI); 630 if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) { 631 MI.setDesc(TII.get(AMDGPU::COPY)); 632 MI.RemoveOperand(2); 633 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) && 634 RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI); 635 } 636 637 Register ShiftSrc0; 638 Register ShiftSrc1; 639 int64_t ShiftAmt; 640 641 // With multiple uses of the shift, this will duplicate the shift and 642 // increase register pressure. 
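  // Matching only one-use shifts (lshr_oneuse) avoids that duplication; the
  // forms handled below are: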
643 // 644 // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16) 645 // => (S_PACK_HH_B32_B16 $src0, $src1) 646 // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16)) 647 // => (S_PACK_LH_B32_B16 $src0, $src1) 648 // (build_vector_trunc $src0, $src1) 649 // => (S_PACK_LL_B32_B16 $src0, $src1) 650 651 // FIXME: This is an inconvenient way to check a specific value 652 bool Shift0 = mi_match( 653 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) && 654 ShiftAmt == 16; 655 656 bool Shift1 = mi_match( 657 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) && 658 ShiftAmt == 16; 659 660 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16; 661 if (Shift0 && Shift1) { 662 Opc = AMDGPU::S_PACK_HH_B32_B16; 663 MI.getOperand(1).setReg(ShiftSrc0); 664 MI.getOperand(2).setReg(ShiftSrc1); 665 } else if (Shift1) { 666 Opc = AMDGPU::S_PACK_LH_B32_B16; 667 MI.getOperand(2).setReg(ShiftSrc1); 668 } else if (Shift0 && isZero(Src1, *MRI)) { 669 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 670 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) 671 .addReg(ShiftSrc0) 672 .addImm(16); 673 674 MI.eraseFromParent(); 675 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 676 } 677 678 MI.setDesc(TII.get(Opc)); 679 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); 680 } 681 682 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 683 return selectG_ADD_SUB(I); 684 } 685 686 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 687 const MachineOperand &MO = I.getOperand(0); 688 689 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 690 // regbank check here is to know why getConstrainedRegClassForOperand failed. 691 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 692 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 693 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 694 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 695 return true; 696 } 697 698 return false; 699 } 700 701 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 702 MachineBasicBlock *BB = I.getParent(); 703 704 Register DstReg = I.getOperand(0).getReg(); 705 Register Src0Reg = I.getOperand(1).getReg(); 706 Register Src1Reg = I.getOperand(2).getReg(); 707 LLT Src1Ty = MRI->getType(Src1Reg); 708 709 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 710 unsigned InsSize = Src1Ty.getSizeInBits(); 711 712 int64_t Offset = I.getOperand(3).getImm(); 713 if (Offset % 32 != 0) 714 return false; 715 716 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 717 if (SubReg == AMDGPU::NoSubRegister) 718 return false; 719 720 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 721 const TargetRegisterClass *DstRC = 722 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 723 if (!DstRC) 724 return false; 725 726 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 727 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 728 const TargetRegisterClass *Src0RC = 729 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 730 const TargetRegisterClass *Src1RC = 731 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 732 733 // Deal with weird cases where the class only partially supports the subreg 734 // index. 
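  // getSubClassWithSubReg returns null when the class has no sub-class that
  // supports SubReg, in which case selection fails below.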
735 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); 736 if (!Src0RC) 737 return false; 738 739 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 740 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) || 741 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI)) 742 return false; 743 744 const DebugLoc &DL = I.getDebugLoc(); 745 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg) 746 .addReg(Src0Reg) 747 .addReg(Src1Reg) 748 .addImm(SubReg); 749 750 I.eraseFromParent(); 751 return true; 752 } 753 754 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const { 755 if (STI.getLDSBankCount() != 16) 756 return selectImpl(MI, *CoverageInfo); 757 758 Register Dst = MI.getOperand(0).getReg(); 759 Register Src0 = MI.getOperand(2).getReg(); 760 Register M0Val = MI.getOperand(6).getReg(); 761 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) || 762 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) || 763 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI)) 764 return false; 765 766 // This requires 2 instructions. It is possible to write a pattern to support 767 // this, but the generated isel emitter doesn't correctly deal with multiple 768 // output instructions using the same physical register input. The copy to m0 769 // is incorrectly placed before the second instruction. 770 // 771 // TODO: Match source modifiers. 772 773 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 774 const DebugLoc &DL = MI.getDebugLoc(); 775 MachineBasicBlock *MBB = MI.getParent(); 776 777 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 778 .addReg(M0Val); 779 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov) 780 .addImm(2) 781 .addImm(MI.getOperand(4).getImm()) // $attr 782 .addImm(MI.getOperand(3).getImm()); // $attrchan 783 784 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst) 785 .addImm(0) // $src0_modifiers 786 .addReg(Src0) // $src0 787 .addImm(MI.getOperand(4).getImm()) // $attr 788 .addImm(MI.getOperand(3).getImm()) // $attrchan 789 .addImm(0) // $src2_modifiers 790 .addReg(InterpMov) // $src2 - 2 f16 values selected by high 791 .addImm(MI.getOperand(5).getImm()) // $high 792 .addImm(0) // $clamp 793 .addImm(0); // $omod 794 795 MI.eraseFromParent(); 796 return true; 797 } 798 799 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { 800 unsigned IntrinsicID = I.getIntrinsicID(); 801 switch (IntrinsicID) { 802 case Intrinsic::amdgcn_if_break: { 803 MachineBasicBlock *BB = I.getParent(); 804 805 // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick 806 // SelectionDAG uses for wave32 vs wave64. 
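    // SI_IF_BREAK defines and uses full wave masks, so all three registers
    // are put in the wave mask register class after the pseudo is built.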
807 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK)) 808 .add(I.getOperand(0)) 809 .add(I.getOperand(2)) 810 .add(I.getOperand(3)); 811 812 Register DstReg = I.getOperand(0).getReg(); 813 Register Src0Reg = I.getOperand(2).getReg(); 814 Register Src1Reg = I.getOperand(3).getReg(); 815 816 I.eraseFromParent(); 817 818 for (Register Reg : { DstReg, Src0Reg, Src1Reg }) 819 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 820 821 return true; 822 } 823 case Intrinsic::amdgcn_interp_p1_f16: 824 return selectInterpP1F16(I); 825 case Intrinsic::amdgcn_wqm: 826 return constrainCopyLikeIntrin(I, AMDGPU::WQM); 827 case Intrinsic::amdgcn_softwqm: 828 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); 829 case Intrinsic::amdgcn_wwm: 830 return constrainCopyLikeIntrin(I, AMDGPU::WWM); 831 default: 832 return selectImpl(I, *CoverageInfo); 833 } 834 } 835 836 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) { 837 if (Size != 32 && Size != 64) 838 return -1; 839 switch (P) { 840 default: 841 llvm_unreachable("Unknown condition code!"); 842 case CmpInst::ICMP_NE: 843 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64; 844 case CmpInst::ICMP_EQ: 845 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64; 846 case CmpInst::ICMP_SGT: 847 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64; 848 case CmpInst::ICMP_SGE: 849 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64; 850 case CmpInst::ICMP_SLT: 851 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64; 852 case CmpInst::ICMP_SLE: 853 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64; 854 case CmpInst::ICMP_UGT: 855 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64; 856 case CmpInst::ICMP_UGE: 857 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64; 858 case CmpInst::ICMP_ULT: 859 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64; 860 case CmpInst::ICMP_ULE: 861 return Size == 32 ? 
AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64; 862 } 863 } 864 865 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, 866 unsigned Size) const { 867 if (Size == 64) { 868 if (!STI.hasScalarCompareEq64()) 869 return -1; 870 871 switch (P) { 872 case CmpInst::ICMP_NE: 873 return AMDGPU::S_CMP_LG_U64; 874 case CmpInst::ICMP_EQ: 875 return AMDGPU::S_CMP_EQ_U64; 876 default: 877 return -1; 878 } 879 } 880 881 if (Size != 32) 882 return -1; 883 884 switch (P) { 885 case CmpInst::ICMP_NE: 886 return AMDGPU::S_CMP_LG_U32; 887 case CmpInst::ICMP_EQ: 888 return AMDGPU::S_CMP_EQ_U32; 889 case CmpInst::ICMP_SGT: 890 return AMDGPU::S_CMP_GT_I32; 891 case CmpInst::ICMP_SGE: 892 return AMDGPU::S_CMP_GE_I32; 893 case CmpInst::ICMP_SLT: 894 return AMDGPU::S_CMP_LT_I32; 895 case CmpInst::ICMP_SLE: 896 return AMDGPU::S_CMP_LE_I32; 897 case CmpInst::ICMP_UGT: 898 return AMDGPU::S_CMP_GT_U32; 899 case CmpInst::ICMP_UGE: 900 return AMDGPU::S_CMP_GE_U32; 901 case CmpInst::ICMP_ULT: 902 return AMDGPU::S_CMP_LT_U32; 903 case CmpInst::ICMP_ULE: 904 return AMDGPU::S_CMP_LE_U32; 905 default: 906 llvm_unreachable("Unknown condition code!"); 907 } 908 } 909 910 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { 911 MachineBasicBlock *BB = I.getParent(); 912 const DebugLoc &DL = I.getDebugLoc(); 913 914 Register SrcReg = I.getOperand(2).getReg(); 915 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 916 917 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 918 919 Register CCReg = I.getOperand(0).getReg(); 920 if (!isVCC(CCReg, *MRI)) { 921 int Opcode = getS_CMPOpcode(Pred, Size); 922 if (Opcode == -1) 923 return false; 924 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) 925 .add(I.getOperand(2)) 926 .add(I.getOperand(3)); 927 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) 928 .addReg(AMDGPU::SCC); 929 bool Ret = 930 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && 931 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); 932 I.eraseFromParent(); 933 return Ret; 934 } 935 936 int Opcode = getV_CMPOpcode(Pred, Size); 937 if (Opcode == -1) 938 return false; 939 940 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), 941 I.getOperand(0).getReg()) 942 .add(I.getOperand(2)) 943 .add(I.getOperand(3)); 944 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), 945 *TRI.getBoolRC(), *MRI); 946 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); 947 I.eraseFromParent(); 948 return Ret; 949 } 950 951 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { 952 // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick 953 // SelectionDAG uses for wave32 vs wave64. 
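  // The mask operand of SI_END_CF is a full wave mask, so it is given the
  // wave mask register class below if it does not already have one.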
954 MachineBasicBlock *BB = MI.getParent(); 955 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) 956 .add(MI.getOperand(1)); 957 958 Register Reg = MI.getOperand(1).getReg(); 959 MI.eraseFromParent(); 960 961 if (!MRI->getRegClassOrNull(Reg)) 962 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 963 return true; 964 } 965 966 static unsigned getDSShaderTypeValue(const MachineFunction &MF) { 967 switch (MF.getFunction().getCallingConv()) { 968 case CallingConv::AMDGPU_PS: 969 return 1; 970 case CallingConv::AMDGPU_VS: 971 return 2; 972 case CallingConv::AMDGPU_GS: 973 return 3; 974 case CallingConv::AMDGPU_HS: 975 case CallingConv::AMDGPU_LS: 976 case CallingConv::AMDGPU_ES: 977 report_fatal_error("ds_ordered_count unsupported for this calling conv"); 978 case CallingConv::AMDGPU_CS: 979 case CallingConv::AMDGPU_KERNEL: 980 case CallingConv::C: 981 case CallingConv::Fast: 982 default: 983 // Assume other calling conventions are various compute callable functions 984 return 0; 985 } 986 } 987 988 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 989 MachineInstr &MI, Intrinsic::ID IntrID) const { 990 MachineBasicBlock *MBB = MI.getParent(); 991 MachineFunction *MF = MBB->getParent(); 992 const DebugLoc &DL = MI.getDebugLoc(); 993 994 unsigned IndexOperand = MI.getOperand(7).getImm(); 995 bool WaveRelease = MI.getOperand(8).getImm() != 0; 996 bool WaveDone = MI.getOperand(9).getImm() != 0; 997 998 if (WaveDone && !WaveRelease) 999 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 1000 1001 unsigned OrderedCountIndex = IndexOperand & 0x3f; 1002 IndexOperand &= ~0x3f; 1003 unsigned CountDw = 0; 1004 1005 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 1006 CountDw = (IndexOperand >> 24) & 0xf; 1007 IndexOperand &= ~(0xf << 24); 1008 1009 if (CountDw < 1 || CountDw > 4) { 1010 report_fatal_error( 1011 "ds_ordered_count: dword count must be between 1 and 4"); 1012 } 1013 } 1014 1015 if (IndexOperand) 1016 report_fatal_error("ds_ordered_count: bad index operand"); 1017 1018 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; 1019 unsigned ShaderType = getDSShaderTypeValue(*MF); 1020 1021 unsigned Offset0 = OrderedCountIndex << 2; 1022 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | 1023 (Instruction << 4); 1024 1025 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) 1026 Offset1 |= (CountDw - 1) << 6; 1027 1028 unsigned Offset = Offset0 | (Offset1 << 8); 1029 1030 Register M0Val = MI.getOperand(2).getReg(); 1031 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1032 .addReg(M0Val); 1033 1034 Register DstReg = MI.getOperand(0).getReg(); 1035 Register ValReg = MI.getOperand(3).getReg(); 1036 MachineInstrBuilder DS = 1037 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) 1038 .addReg(ValReg) 1039 .addImm(Offset) 1040 .cloneMemRefs(MI); 1041 1042 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) 1043 return false; 1044 1045 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); 1046 MI.eraseFromParent(); 1047 return Ret; 1048 } 1049 1050 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 1051 switch (IntrID) { 1052 case Intrinsic::amdgcn_ds_gws_init: 1053 return AMDGPU::DS_GWS_INIT; 1054 case Intrinsic::amdgcn_ds_gws_barrier: 1055 return AMDGPU::DS_GWS_BARRIER; 1056 case Intrinsic::amdgcn_ds_gws_sema_v: 1057 return AMDGPU::DS_GWS_SEMA_V; 1058 case Intrinsic::amdgcn_ds_gws_sema_br: 1059 return AMDGPU::DS_GWS_SEMA_BR; 1060 case Intrinsic::amdgcn_ds_gws_sema_p: 1061 return AMDGPU::DS_GWS_SEMA_P; 1062 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1063 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 1064 default: 1065 llvm_unreachable("not a gws intrinsic"); 1066 } 1067 } 1068 1069 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, 1070 Intrinsic::ID IID) const { 1071 if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && 1072 !STI.hasGWSSemaReleaseAll()) 1073 return false; 1074 1075 // intrinsic ID, vsrc, offset 1076 const bool HasVSrc = MI.getNumOperands() == 3; 1077 assert(HasVSrc || MI.getNumOperands() == 2); 1078 1079 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg(); 1080 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); 1081 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) 1082 return false; 1083 1084 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1085 assert(OffsetDef); 1086 1087 unsigned ImmOffset; 1088 1089 MachineBasicBlock *MBB = MI.getParent(); 1090 const DebugLoc &DL = MI.getDebugLoc(); 1091 1092 MachineInstr *Readfirstlane = nullptr; 1093 1094 // If we legalized the VGPR input, strip out the readfirstlane to analyze the 1095 // incoming offset, in case there's an add of a constant. We'll have to put it 1096 // back later. 1097 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { 1098 Readfirstlane = OffsetDef; 1099 BaseOffset = OffsetDef->getOperand(1).getReg(); 1100 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1101 } 1102 1103 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { 1104 // If we have a constant offset, try to use the 0 in m0 as the base. 1105 // TODO: Look into changing the default m0 initialization value. If the 1106 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1107 // the immediate offset. 
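    // With m0 set to 0, the entire constant is carried in the instruction's
    // immediate offset operand added below.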
1108 1109 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); 1110 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1111 .addImm(0); 1112 } else { 1113 std::tie(BaseOffset, ImmOffset, OffsetDef) 1114 = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset); 1115 1116 if (Readfirstlane) { 1117 // We have the constant offset now, so put the readfirstlane back on the 1118 // variable component. 1119 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) 1120 return false; 1121 1122 Readfirstlane->getOperand(1).setReg(BaseOffset); 1123 BaseOffset = Readfirstlane->getOperand(0).getReg(); 1124 } else { 1125 if (!RBI.constrainGenericRegister(BaseOffset, 1126 AMDGPU::SReg_32RegClass, *MRI)) 1127 return false; 1128 } 1129 1130 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1131 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) 1132 .addReg(BaseOffset) 1133 .addImm(16); 1134 1135 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1136 .addReg(M0Base); 1137 } 1138 1139 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1140 // offset field) % 64. Some versions of the programming guide omit the m0 1141 // part, or claim it's from offset 0. 1142 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); 1143 1144 if (HasVSrc) { 1145 Register VSrc = MI.getOperand(1).getReg(); 1146 MIB.addReg(VSrc); 1147 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) 1148 return false; 1149 } 1150 1151 MIB.addImm(ImmOffset) 1152 .addImm(-1) // $gds 1153 .cloneMemRefs(MI); 1154 1155 MI.eraseFromParent(); 1156 return true; 1157 } 1158 1159 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, 1160 bool IsAppend) const { 1161 Register PtrBase = MI.getOperand(2).getReg(); 1162 LLT PtrTy = MRI->getType(PtrBase); 1163 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 1164 1165 unsigned Offset; 1166 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); 1167 1168 // TODO: Should this try to look through readfirstlane like GWS? 1169 if (!isDSOffsetLegal(PtrBase, Offset, 16)) { 1170 PtrBase = MI.getOperand(2).getReg(); 1171 Offset = 0; 1172 } 1173 1174 MachineBasicBlock *MBB = MI.getParent(); 1175 const DebugLoc &DL = MI.getDebugLoc(); 1176 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1177 1178 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1179 .addReg(PtrBase); 1180 BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) 1181 .addImm(Offset) 1182 .addImm(IsGDS ? 
-1 : 0) 1183 .cloneMemRefs(MI); 1184 1185 MI.eraseFromParent(); 1186 return true; 1187 } 1188 1189 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( 1190 MachineInstr &I) const { 1191 unsigned IntrinsicID = I.getIntrinsicID(); 1192 switch (IntrinsicID) { 1193 case Intrinsic::amdgcn_end_cf: 1194 return selectEndCfIntrinsic(I); 1195 case Intrinsic::amdgcn_ds_ordered_add: 1196 case Intrinsic::amdgcn_ds_ordered_swap: 1197 return selectDSOrderedIntrinsic(I, IntrinsicID); 1198 case Intrinsic::amdgcn_ds_gws_init: 1199 case Intrinsic::amdgcn_ds_gws_barrier: 1200 case Intrinsic::amdgcn_ds_gws_sema_v: 1201 case Intrinsic::amdgcn_ds_gws_sema_br: 1202 case Intrinsic::amdgcn_ds_gws_sema_p: 1203 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1204 return selectDSGWSIntrinsic(I, IntrinsicID); 1205 case Intrinsic::amdgcn_ds_append: 1206 return selectDSAppendConsume(I, true); 1207 case Intrinsic::amdgcn_ds_consume: 1208 return selectDSAppendConsume(I, false); 1209 default: 1210 return selectImpl(I, *CoverageInfo); 1211 } 1212 } 1213 1214 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { 1215 if (selectImpl(I, *CoverageInfo)) 1216 return true; 1217 1218 MachineBasicBlock *BB = I.getParent(); 1219 const DebugLoc &DL = I.getDebugLoc(); 1220 1221 Register DstReg = I.getOperand(0).getReg(); 1222 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 1223 assert(Size <= 32 || Size == 64); 1224 const MachineOperand &CCOp = I.getOperand(1); 1225 Register CCReg = CCOp.getReg(); 1226 if (!isVCC(CCReg, *MRI)) { 1227 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : 1228 AMDGPU::S_CSELECT_B32; 1229 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1230 .addReg(CCReg); 1231 1232 // The generic constrainSelectedInstRegOperands doesn't work for the scc register 1233 // bank, because it does not cover the register class that we used to represent 1234 // for it. So we need to manually set the register class here. 1235 if (!MRI->getRegClassOrNull(CCReg)) 1236 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1237 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1238 .add(I.getOperand(2)) 1239 .add(I.getOperand(3)); 1240 1241 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1242 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1243 I.eraseFromParent(); 1244 return Ret; 1245 } 1246 1247 // Wide VGPR select should have been split in RegBankSelect. 
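  // V_CNDMASK_B32 takes src1 for lanes where the condition bit is set and
  // src0 otherwise, so below the false value (operand 3) is placed in src0
  // and the true value (operand 2) in src1.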
1248 if (Size > 32) 1249 return false; 1250 1251 MachineInstr *Select = 1252 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1253 .addImm(0) 1254 .add(I.getOperand(3)) 1255 .addImm(0) 1256 .add(I.getOperand(2)) 1257 .add(I.getOperand(1)); 1258 1259 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1260 I.eraseFromParent(); 1261 return Ret; 1262 } 1263 1264 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1265 initM0(I); 1266 return selectImpl(I, *CoverageInfo); 1267 } 1268 1269 static int sizeToSubRegIndex(unsigned Size) { 1270 switch (Size) { 1271 case 32: 1272 return AMDGPU::sub0; 1273 case 64: 1274 return AMDGPU::sub0_sub1; 1275 case 96: 1276 return AMDGPU::sub0_sub1_sub2; 1277 case 128: 1278 return AMDGPU::sub0_sub1_sub2_sub3; 1279 case 256: 1280 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1281 default: 1282 if (Size < 32) 1283 return AMDGPU::sub0; 1284 if (Size > 256) 1285 return -1; 1286 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1287 } 1288 } 1289 1290 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1291 Register DstReg = I.getOperand(0).getReg(); 1292 Register SrcReg = I.getOperand(1).getReg(); 1293 const LLT DstTy = MRI->getType(DstReg); 1294 const LLT SrcTy = MRI->getType(SrcReg); 1295 const LLT S1 = LLT::scalar(1); 1296 1297 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1298 const RegisterBank *DstRB; 1299 if (DstTy == S1) { 1300 // This is a special case. We don't treat s1 for legalization artifacts as 1301 // vcc booleans. 1302 DstRB = SrcRB; 1303 } else { 1304 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1305 if (SrcRB != DstRB) 1306 return false; 1307 } 1308 1309 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 1310 1311 unsigned DstSize = DstTy.getSizeInBits(); 1312 unsigned SrcSize = SrcTy.getSizeInBits(); 1313 1314 const TargetRegisterClass *SrcRC 1315 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1316 const TargetRegisterClass *DstRC 1317 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1318 1319 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1320 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1321 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1322 return false; 1323 } 1324 1325 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { 1326 MachineBasicBlock *MBB = I.getParent(); 1327 const DebugLoc &DL = I.getDebugLoc(); 1328 1329 Register LoReg = MRI->createVirtualRegister(DstRC); 1330 Register HiReg = MRI->createVirtualRegister(DstRC); 1331 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 1332 .addReg(SrcReg, 0, AMDGPU::sub0); 1333 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 1334 .addReg(SrcReg, 0, AMDGPU::sub1); 1335 1336 if (IsVALU && STI.hasSDWA()) { 1337 // Write the low 16-bits of the high element into the high 16-bits of the 1338 // low element. 
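      // The implicit use of LoReg is tied to the def below so that the
      // preserved (unwritten) low half of the destination takes its value
      // from LoReg.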
1339 MachineInstr *MovSDWA = 1340 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1341 .addImm(0) // $src0_modifiers 1342 .addReg(HiReg) // $src0 1343 .addImm(0) // $clamp 1344 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1345 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1346 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1347 .addReg(LoReg, RegState::Implicit); 1348 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1349 } else { 1350 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1351 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1352 Register ImmReg = MRI->createVirtualRegister(DstRC); 1353 if (IsVALU) { 1354 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1355 .addImm(16) 1356 .addReg(HiReg); 1357 } else { 1358 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1359 .addReg(HiReg) 1360 .addImm(16); 1361 } 1362 1363 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1364 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1365 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1366 1367 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1368 .addImm(0xffff); 1369 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1370 .addReg(LoReg) 1371 .addReg(ImmReg); 1372 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1373 .addReg(TmpReg0) 1374 .addReg(TmpReg1); 1375 } 1376 1377 I.eraseFromParent(); 1378 return true; 1379 } 1380 1381 if (!DstTy.isScalar()) 1382 return false; 1383 1384 if (SrcSize > 32) { 1385 int SubRegIdx = sizeToSubRegIndex(DstSize); 1386 if (SubRegIdx == -1) 1387 return false; 1388 1389 // Deal with weird cases where the class only partially supports the subreg 1390 // index. 1391 const TargetRegisterClass *SrcWithSubRC 1392 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1393 if (!SrcWithSubRC) 1394 return false; 1395 1396 if (SrcWithSubRC != SrcRC) { 1397 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1398 return false; 1399 } 1400 1401 I.getOperand(1).setSubReg(SubRegIdx); 1402 } 1403 1404 I.setDesc(TII.get(TargetOpcode::COPY)); 1405 return true; 1406 } 1407 1408 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1409 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1410 Mask = maskTrailingOnes<unsigned>(Size); 1411 int SignedMask = static_cast<int>(Mask); 1412 return SignedMask >= -16 && SignedMask <= 64; 1413 } 1414 1415 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1416 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1417 Register Reg, const MachineRegisterInfo &MRI, 1418 const TargetRegisterInfo &TRI) const { 1419 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1420 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1421 return RB; 1422 1423 // Ignore the type, since we don't use vcc in artifacts. 
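  // The invalid LLT passed here means the bank is derived from the register
  // class alone, rather than mapping a 1-bit SGPR value to VCC.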
1424 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1425 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1426 return nullptr; 1427 } 1428 1429 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1430 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1431 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1432 const DebugLoc &DL = I.getDebugLoc(); 1433 MachineBasicBlock &MBB = *I.getParent(); 1434 const Register DstReg = I.getOperand(0).getReg(); 1435 const Register SrcReg = I.getOperand(1).getReg(); 1436 1437 const LLT DstTy = MRI->getType(DstReg); 1438 const LLT SrcTy = MRI->getType(SrcReg); 1439 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1440 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1441 const unsigned DstSize = DstTy.getSizeInBits(); 1442 if (!DstTy.isScalar()) 1443 return false; 1444 1445 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1446 return selectCOPY(I); 1447 1448 // Artifact casts should never use vcc. 1449 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1450 1451 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1452 // 64-bit should have been split up in RegBankSelect 1453 1454 // Try to use an and with a mask if it will save code size. 1455 unsigned Mask; 1456 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1457 MachineInstr *ExtI = 1458 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1459 .addImm(Mask) 1460 .addReg(SrcReg); 1461 I.eraseFromParent(); 1462 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1463 } 1464 1465 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1466 MachineInstr *ExtI = 1467 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1468 .addReg(SrcReg) 1469 .addImm(0) // Offset 1470 .addImm(SrcSize); // Width 1471 I.eraseFromParent(); 1472 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1473 } 1474 1475 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1476 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 1477 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 1478 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 1479 return false; 1480 1481 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1482 const unsigned SextOpc = SrcSize == 8 ? 1483 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1484 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1485 .addReg(SrcReg); 1486 I.eraseFromParent(); 1487 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1488 } 1489 1490 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1491 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1492 1493 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1494 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 1495 // We need a 64-bit register source, but the high bits don't matter. 1496 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1497 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1498 unsigned SubReg = InReg ? 
AMDGPU::sub0 : 0; 1499 1500 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1501 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1502 .addReg(SrcReg, 0, SubReg) 1503 .addImm(AMDGPU::sub0) 1504 .addReg(UndefReg) 1505 .addImm(AMDGPU::sub1); 1506 1507 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1508 .addReg(ExtReg) 1509 .addImm(SrcSize << 16); 1510 1511 I.eraseFromParent(); 1512 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1513 } 1514 1515 unsigned Mask; 1516 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1517 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1518 .addReg(SrcReg) 1519 .addImm(Mask); 1520 } else { 1521 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1522 .addReg(SrcReg) 1523 .addImm(SrcSize << 16); 1524 } 1525 1526 I.eraseFromParent(); 1527 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1528 } 1529 1530 return false; 1531 } 1532 1533 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1534 MachineBasicBlock *BB = I.getParent(); 1535 MachineOperand &ImmOp = I.getOperand(1); 1536 1537 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1538 if (ImmOp.isFPImm()) { 1539 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1540 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1541 } else if (ImmOp.isCImm()) { 1542 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1543 } 1544 1545 Register DstReg = I.getOperand(0).getReg(); 1546 unsigned Size; 1547 bool IsSgpr; 1548 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1549 if (RB) { 1550 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1551 Size = MRI->getType(DstReg).getSizeInBits(); 1552 } else { 1553 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1554 IsSgpr = TRI.isSGPRClass(RC); 1555 Size = TRI.getRegSizeInBits(*RC); 1556 } 1557 1558 if (Size != 32 && Size != 64) 1559 return false; 1560 1561 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1562 if (Size == 32) { 1563 I.setDesc(TII.get(Opcode)); 1564 I.addImplicitDefUseOperands(*MF); 1565 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1566 } 1567 1568 const DebugLoc &DL = I.getDebugLoc(); 1569 1570 APInt Imm(Size, I.getOperand(1).getImm()); 1571 1572 MachineInstr *ResInst; 1573 if (IsSgpr && TII.isInlineConstant(Imm)) { 1574 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1575 .addImm(I.getOperand(1).getImm()); 1576 } else { 1577 const TargetRegisterClass *RC = IsSgpr ? 
1578 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 1579 Register LoReg = MRI->createVirtualRegister(RC); 1580 Register HiReg = MRI->createVirtualRegister(RC); 1581 1582 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 1583 .addImm(Imm.trunc(32).getZExtValue()); 1584 1585 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 1586 .addImm(Imm.ashr(32).getZExtValue()); 1587 1588 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1589 .addReg(LoReg) 1590 .addImm(AMDGPU::sub0) 1591 .addReg(HiReg) 1592 .addImm(AMDGPU::sub1); 1593 } 1594 1595 // We can't call constrainSelectedInstRegOperands here, because it doesn't 1596 // work for target independent opcodes 1597 I.eraseFromParent(); 1598 const TargetRegisterClass *DstRC = 1599 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 1600 if (!DstRC) 1601 return true; 1602 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1603 } 1604 1605 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 1606 // Only manually handle the f64 SGPR case. 1607 // 1608 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 1609 // the bit ops theoretically have a second result due to the implicit def of 1610 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 1611 // that is easy by disabling the check. The result works, but uses a 1612 // nonsensical sreg32orlds_and_sreg_1 regclass. 1613 // 1614 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 1615 // the variadic REG_SEQUENCE operands. 1616 1617 Register Dst = MI.getOperand(0).getReg(); 1618 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 1619 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 1620 MRI->getType(Dst) != LLT::scalar(64)) 1621 return false; 1622 1623 Register Src = MI.getOperand(1).getReg(); 1624 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 1625 if (Fabs) 1626 Src = Fabs->getOperand(1).getReg(); 1627 1628 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 1629 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 1630 return false; 1631 1632 MachineBasicBlock *BB = MI.getParent(); 1633 const DebugLoc &DL = MI.getDebugLoc(); 1634 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1635 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1636 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1637 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1638 1639 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 1640 .addReg(Src, 0, AMDGPU::sub0); 1641 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 1642 .addReg(Src, 0, AMDGPU::sub1); 1643 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 1644 .addImm(0x80000000); 1645 1646 // Set or toggle sign bit. 1647 unsigned Opc = Fabs ? 
AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32; 1648 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg) 1649 .addReg(HiReg) 1650 .addReg(ConstReg); 1651 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 1652 .addReg(LoReg) 1653 .addImm(AMDGPU::sub0) 1654 .addReg(OpReg) 1655 .addImm(AMDGPU::sub1); 1656 MI.eraseFromParent(); 1657 return true; 1658 } 1659 1660 static bool isConstant(const MachineInstr &MI) { 1661 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 1662 } 1663 1664 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 1665 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 1666 1667 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); 1668 1669 assert(PtrMI); 1670 1671 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 1672 return; 1673 1674 GEPInfo GEPInfo(*PtrMI); 1675 1676 for (unsigned i = 1; i != 3; ++i) { 1677 const MachineOperand &GEPOp = PtrMI->getOperand(i); 1678 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 1679 assert(OpDef); 1680 if (i == 2 && isConstant(*OpDef)) { 1681 // TODO: Could handle constant base + variable offset, but a combine 1682 // probably should have commuted it. 1683 assert(GEPInfo.Imm == 0); 1684 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 1685 continue; 1686 } 1687 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 1688 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 1689 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 1690 else 1691 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 1692 } 1693 1694 AddrInfo.push_back(GEPInfo); 1695 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 1696 } 1697 1698 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 1699 if (!MI.hasOneMemOperand()) 1700 return false; 1701 1702 const MachineMemOperand *MMO = *MI.memoperands_begin(); 1703 const Value *Ptr = MMO->getValue(); 1704 1705 // UndefValue means this is a load of a kernel input. These are uniform. 1706 // Sometimes LDS instructions have constant pointers. 1707 // If Ptr is null, then that means this mem operand contains a 1708 // PseudoSourceValue like GOT. 1709 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || 1710 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) 1711 return true; 1712 1713 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 1714 return true; 1715 1716 const Instruction *I = dyn_cast<Instruction>(Ptr); 1717 return I && I->getMetadata("amdgpu.uniform"); 1718 } 1719 1720 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { 1721 for (const GEPInfo &GEPInfo : AddrInfo) { 1722 if (!GEPInfo.VgprParts.empty()) 1723 return true; 1724 } 1725 return false; 1726 } 1727 1728 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { 1729 MachineBasicBlock *BB = I.getParent(); 1730 1731 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); 1732 unsigned AS = PtrTy.getAddressSpace(); 1733 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && 1734 STI.ldsRequiresM0Init()) { 1735 // If DS instructions require M0 initializtion, insert it before selecting. 1736 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1737 .addImm(-1); 1738 } 1739 } 1740 1741 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { 1742 initM0(I); 1743 return selectImpl(I, *CoverageInfo); 1744 } 1745 1746 // TODO: No rtn optimization. 
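// A successful non-FLAT match below expands roughly as follows (32-bit case
// with a pure immediate offset; register names are illustrative only):
//   %tmp:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN %cmp_swap_data, %rsrc, 0, offset, 0 (slc)
//   %dst:vgpr_32 = COPY %tmp.sub0
// i.e. the packed cmp/swap pair feeds the wide vdata operand and the previous
// memory value is read back from the low half of the returned register.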
1747 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG( 1748 MachineInstr &MI) const { 1749 Register PtrReg = MI.getOperand(1).getReg(); 1750 const LLT PtrTy = MRI->getType(PtrReg); 1751 if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || 1752 STI.useFlatForGlobal()) 1753 return selectImpl(MI, *CoverageInfo); 1754 1755 Register DstReg = MI.getOperand(0).getReg(); 1756 const LLT Ty = MRI->getType(DstReg); 1757 const bool Is64 = Ty.getSizeInBits() == 64; 1758 const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; 1759 Register TmpReg = MRI->createVirtualRegister( 1760 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); 1761 1762 const DebugLoc &DL = MI.getDebugLoc(); 1763 MachineBasicBlock *BB = MI.getParent(); 1764 1765 Register VAddr, RSrcReg, SOffset; 1766 int64_t Offset = 0; 1767 1768 unsigned Opcode; 1769 if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) { 1770 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN : 1771 AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN; 1772 } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr, 1773 RSrcReg, SOffset, Offset)) { 1774 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : 1775 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; 1776 } else 1777 return selectImpl(MI, *CoverageInfo); 1778 1779 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) 1780 .addReg(MI.getOperand(2).getReg()); 1781 1782 if (VAddr) 1783 MIB.addReg(VAddr); 1784 1785 MIB.addReg(RSrcReg); 1786 if (SOffset) 1787 MIB.addReg(SOffset); 1788 else 1789 MIB.addImm(0); 1790 1791 MIB.addImm(Offset); 1792 MIB.addImm(0); // slc 1793 MIB.cloneMemRefs(MI); 1794 1795 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) 1796 .addReg(TmpReg, RegState::Kill, SubReg); 1797 1798 MI.eraseFromParent(); 1799 1800 MRI->setRegClass( 1801 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); 1802 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1803 } 1804 1805 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 1806 MachineBasicBlock *BB = I.getParent(); 1807 MachineOperand &CondOp = I.getOperand(0); 1808 Register CondReg = CondOp.getReg(); 1809 const DebugLoc &DL = I.getDebugLoc(); 1810 1811 unsigned BrOpcode; 1812 Register CondPhysReg; 1813 const TargetRegisterClass *ConstrainRC; 1814 1815 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 1816 // whether the branch is uniform when selecting the instruction. In 1817 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 1818 // RegBankSelect knows what it's doing if the branch condition is scc, even 1819 // though it currently does not. 1820 if (!isVCC(CondReg, *MRI)) { 1821 if (MRI->getType(CondReg) != LLT::scalar(32)) 1822 return false; 1823 1824 CondPhysReg = AMDGPU::SCC; 1825 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 1826 // FIXME: Hack for isSCC tests 1827 ConstrainRC = &AMDGPU::SGPR_32RegClass; 1828 } else { 1829 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 1830 // We sort of know that a VCC producer based on the register bank, that ands 1831 // inactive lanes with 0. What if there was a logical operation with vcc 1832 // producers in different blocks/with different exec masks? 1833 // FIXME: Should scc->vcc copies and with exec? 
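    // For the divergent case the lowering below is simply a copy of the
    // condition into $vcc followed by S_CBRANCH_VCCNZ on the target block; the
    // uniform case above uses $scc and S_CBRANCH_SCC1 instead.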
1834 CondPhysReg = TRI.getVCC(); 1835 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 1836 ConstrainRC = TRI.getBoolRC(); 1837 } 1838 1839 if (!MRI->getRegClassOrNull(CondReg)) 1840 MRI->setRegClass(CondReg, ConstrainRC); 1841 1842 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 1843 .addReg(CondReg); 1844 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 1845 .addMBB(I.getOperand(1).getMBB()); 1846 1847 I.eraseFromParent(); 1848 return true; 1849 } 1850 1851 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE( 1852 MachineInstr &I) const { 1853 Register DstReg = I.getOperand(0).getReg(); 1854 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1855 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1856 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 1857 if (IsVGPR) 1858 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 1859 1860 return RBI.constrainGenericRegister( 1861 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 1862 } 1863 1864 bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { 1865 uint64_t Align = I.getOperand(2).getImm(); 1866 const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); 1867 1868 MachineBasicBlock *BB = I.getParent(); 1869 1870 Register DstReg = I.getOperand(0).getReg(); 1871 Register SrcReg = I.getOperand(1).getReg(); 1872 1873 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1874 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1875 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1876 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1877 unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1878 const TargetRegisterClass &RegRC 1879 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 1880 1881 LLT Ty = MRI->getType(DstReg); 1882 1883 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 1884 *MRI); 1885 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 1886 *MRI); 1887 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 1888 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 1889 return false; 1890 1891 const DebugLoc &DL = I.getDebugLoc(); 1892 Register ImmReg = MRI->createVirtualRegister(&RegRC); 1893 BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg) 1894 .addImm(Mask); 1895 1896 if (Ty.getSizeInBits() == 32) { 1897 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 1898 .addReg(SrcReg) 1899 .addReg(ImmReg); 1900 I.eraseFromParent(); 1901 return true; 1902 } 1903 1904 Register HiReg = MRI->createVirtualRegister(&RegRC); 1905 Register LoReg = MRI->createVirtualRegister(&RegRC); 1906 Register MaskLo = MRI->createVirtualRegister(&RegRC); 1907 1908 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 1909 .addReg(SrcReg, 0, AMDGPU::sub0); 1910 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 1911 .addReg(SrcReg, 0, AMDGPU::sub1); 1912 1913 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo) 1914 .addReg(LoReg) 1915 .addReg(ImmReg); 1916 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1917 .addReg(MaskLo) 1918 .addImm(AMDGPU::sub0) 1919 .addReg(HiReg) 1920 .addImm(AMDGPU::sub1); 1921 I.eraseFromParent(); 1922 return true; 1923 } 1924 1925 /// Return the register to use for the index value, and the subregister to use 1926 /// for the indirectly accessed register. 
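///
/// For example, with a 128-bit super register of 32-bit elements and an index
/// of the form (G_ADD %base, 2), this can return %base together with the
/// statically known sub2 subregister, provided the constant offset stays
/// within the register's split parts.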
1927 static std::pair<Register, unsigned> 1928 computeIndirectRegIndex(MachineRegisterInfo &MRI, 1929 const SIRegisterInfo &TRI, 1930 const TargetRegisterClass *SuperRC, 1931 Register IdxReg, 1932 unsigned EltSize) { 1933 Register IdxBaseReg; 1934 int Offset; 1935 MachineInstr *Unused; 1936 1937 std::tie(IdxBaseReg, Offset, Unused) 1938 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 1939 if (IdxBaseReg == AMDGPU::NoRegister) { 1940 // This will happen if the index is a known constant. This should ordinarily 1941 // be legalized out, but handle it as a register just in case. 1942 assert(Offset == 0); 1943 IdxBaseReg = IdxReg; 1944 } 1945 1946 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 1947 1948 // Skip out of bounds offsets, or else we would end up using an undefined 1949 // register. 1950 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 1951 return std::make_pair(IdxReg, SubRegs[0]); 1952 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 1953 } 1954 1955 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 1956 MachineInstr &MI) const { 1957 Register DstReg = MI.getOperand(0).getReg(); 1958 Register SrcReg = MI.getOperand(1).getReg(); 1959 Register IdxReg = MI.getOperand(2).getReg(); 1960 1961 LLT DstTy = MRI->getType(DstReg); 1962 LLT SrcTy = MRI->getType(SrcReg); 1963 1964 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1965 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1966 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 1967 1968 // The index must be scalar. If it wasn't RegBankSelect should have moved this 1969 // into a waterfall loop. 1970 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 1971 return false; 1972 1973 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 1974 *MRI); 1975 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 1976 *MRI); 1977 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1978 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 1979 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 1980 return false; 1981 1982 MachineBasicBlock *BB = MI.getParent(); 1983 const DebugLoc &DL = MI.getDebugLoc(); 1984 const bool Is64 = DstTy.getSizeInBits() == 64; 1985 1986 unsigned SubReg; 1987 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 1988 DstTy.getSizeInBits() / 8); 1989 1990 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 1991 if (DstTy.getSizeInBits() != 32 && !Is64) 1992 return false; 1993 1994 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1995 .addReg(IdxReg); 1996 1997 unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 1998 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 1999 .addReg(SrcReg, 0, SubReg) 2000 .addReg(SrcReg, RegState::Implicit); 2001 MI.eraseFromParent(); 2002 return true; 2003 } 2004 2005 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 2006 return false; 2007 2008 if (!STI.useVGPRIndexMode()) { 2009 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2010 .addReg(IdxReg); 2011 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 2012 .addReg(SrcReg, RegState::Undef, SubReg) 2013 .addReg(SrcReg, RegState::Implicit); 2014 MI.eraseFromParent(); 2015 return true; 2016 } 2017 2018 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2019 .addReg(IdxReg) 2020 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 2021 BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg) 2022 .addReg(SrcReg, RegState::Undef, SubReg) 2023 .addReg(SrcReg, RegState::Implicit) 2024 .addReg(AMDGPU::M0, RegState::Implicit); 2025 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2026 2027 MI.eraseFromParent(); 2028 return true; 2029 } 2030 2031 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 2032 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 2033 MachineInstr &MI) const { 2034 Register DstReg = MI.getOperand(0).getReg(); 2035 Register VecReg = MI.getOperand(1).getReg(); 2036 Register ValReg = MI.getOperand(2).getReg(); 2037 Register IdxReg = MI.getOperand(3).getReg(); 2038 2039 LLT VecTy = MRI->getType(DstReg); 2040 LLT ValTy = MRI->getType(ValReg); 2041 unsigned VecSize = VecTy.getSizeInBits(); 2042 unsigned ValSize = ValTy.getSizeInBits(); 2043 2044 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 2045 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 2046 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2047 2048 assert(VecTy.getElementType() == ValTy); 2049 2050 // The index must be scalar. If it wasn't, RegBankSelect should have moved this 2051 // into a waterfall loop.
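  // (Both paths below take the index from M0 or from the VGPR index mode,
  // which read a wave-uniform value, so a VGPR index cannot be used directly
  // here.)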
2052 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2053 return false; 2054 2055 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 2056 *MRI); 2057 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 2058 *MRI); 2059 2060 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 2061 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 2062 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 2063 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2064 return false; 2065 2066 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 2067 return false; 2068 2069 unsigned SubReg; 2070 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 2071 ValSize / 8); 2072 2073 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 2074 STI.useVGPRIndexMode(); 2075 2076 MachineBasicBlock *BB = MI.getParent(); 2077 const DebugLoc &DL = MI.getDebugLoc(); 2078 2079 if (IndexMode) { 2080 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) 2081 .addReg(IdxReg) 2082 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 2083 } else { 2084 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2085 .addReg(IdxReg); 2086 } 2087 2088 const MCInstrDesc &RegWriteOp 2089 = TII.getIndirectRegWritePseudo(VecSize, ValSize, 2090 VecRB->getID() == AMDGPU::SGPRRegBankID); 2091 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 2092 .addReg(VecReg) 2093 .addReg(ValReg) 2094 .addImm(SubReg); 2095 2096 if (IndexMode) 2097 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); 2098 2099 MI.eraseFromParent(); 2100 return true; 2101 } 2102 2103 static bool isZeroOrUndef(int X) { 2104 return X == 0 || X == -1; 2105 } 2106 2107 static bool isOneOrUndef(int X) { 2108 return X == 1 || X == -1; 2109 } 2110 2111 static bool isZeroOrOneOrUndef(int X) { 2112 return X == 0 || X == 1 || X == -1; 2113 } 2114 2115 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single 2116 // 32-bit register. 2117 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, 2118 ArrayRef<int> Mask) { 2119 NewMask[0] = Mask[0]; 2120 NewMask[1] = Mask[1]; 2121 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) 2122 return Src0; 2123 2124 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); 2125 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); 2126 2127 // Shift the mask inputs to be 0/1; 2128 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; 2129 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; 2130 return Src1; 2131 } 2132 2133 // This is only legal with VOP3P instructions as an aid to op_sel matching. 
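// Mask elements index the 16-bit halves of the concatenated sources: 0/1 are
// the low/high half of src0, 2/3 the low/high half of src1, and -1 is undef.
// Only masks that read a single source are treated as legal here; e.g. a
// <3, 2> mask is normalized to <1, 0> on src1 and then selected as the
// half-swapping case below.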
2134 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( 2135 MachineInstr &MI) const { 2136 Register DstReg = MI.getOperand(0).getReg(); 2137 Register Src0Reg = MI.getOperand(1).getReg(); 2138 Register Src1Reg = MI.getOperand(2).getReg(); 2139 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); 2140 2141 const LLT V2S16 = LLT::vector(2, 16); 2142 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) 2143 return false; 2144 2145 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) 2146 return false; 2147 2148 assert(ShufMask.size() == 2); 2149 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); 2150 2151 MachineBasicBlock *MBB = MI.getParent(); 2152 const DebugLoc &DL = MI.getDebugLoc(); 2153 2154 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2155 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2156 const TargetRegisterClass &RC = IsVALU ? 2157 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2158 2159 // Handle the degenerate case which should have folded out. 2160 if (ShufMask[0] == -1 && ShufMask[1] == -1) { 2161 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); 2162 2163 MI.eraseFromParent(); 2164 return RBI.constrainGenericRegister(DstReg, RC, *MRI); 2165 } 2166 2167 // A legal VOP3P mask only reads one of the sources. 2168 int Mask[2]; 2169 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); 2170 2171 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || 2172 !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) 2173 return false; 2174 2175 // TODO: This also should have been folded out 2176 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { 2177 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) 2178 .addReg(SrcVec); 2179 2180 MI.eraseFromParent(); 2181 return true; 2182 } 2183 2184 if (Mask[0] == 1 && Mask[1] == -1) { 2185 if (IsVALU) { 2186 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 2187 .addImm(16) 2188 .addReg(SrcVec); 2189 } else { 2190 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 2191 .addReg(SrcVec) 2192 .addImm(16); 2193 } 2194 } else if (Mask[0] == -1 && Mask[1] == 0) { 2195 if (IsVALU) { 2196 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) 2197 .addImm(16) 2198 .addReg(SrcVec); 2199 } else { 2200 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) 2201 .addReg(SrcVec) 2202 .addImm(16); 2203 } 2204 } else if (Mask[0] == 0 && Mask[1] == 0) { 2205 if (IsVALU) { 2206 // Write low half of the register into the high half. 2207 MachineInstr *MovSDWA = 2208 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2209 .addImm(0) // $src0_modifiers 2210 .addReg(SrcVec) // $src0 2211 .addImm(0) // $clamp 2212 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2213 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2214 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2215 .addReg(SrcVec, RegState::Implicit); 2216 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2217 } else { 2218 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2219 .addReg(SrcVec) 2220 .addReg(SrcVec); 2221 } 2222 } else if (Mask[0] == 1 && Mask[1] == 1) { 2223 if (IsVALU) { 2224 // Write high half of the register into the low half. 
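      // dst_sel WORD_0 with dst_unused UNUSED_PRESERVE keeps the high half of
      // the tied SrcVec operand, while src0_sel WORD_1 writes the high half
      // into the low word, so the result is <hi, hi> of SrcVec.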
2225 MachineInstr *MovSDWA = 2226 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2227 .addImm(0) // $src0_modifiers 2228 .addReg(SrcVec) // $src0 2229 .addImm(0) // $clamp 2230 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel 2231 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2232 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel 2233 .addReg(SrcVec, RegState::Implicit); 2234 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2235 } else { 2236 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) 2237 .addReg(SrcVec) 2238 .addReg(SrcVec); 2239 } 2240 } else if (Mask[0] == 1 && Mask[1] == 0) { 2241 if (IsVALU) { 2242 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg) 2243 .addReg(SrcVec) 2244 .addReg(SrcVec) 2245 .addImm(16); 2246 } else { 2247 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2248 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) 2249 .addReg(SrcVec) 2250 .addImm(16); 2251 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2252 .addReg(TmpReg) 2253 .addReg(SrcVec); 2254 } 2255 } else 2256 llvm_unreachable("all shuffle masks should be handled"); 2257 2258 MI.eraseFromParent(); 2259 return true; 2260 } 2261 2262 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 2263 if (I.isPHI()) 2264 return selectPHI(I); 2265 2266 if (!I.isPreISelOpcode()) { 2267 if (I.isCopy()) 2268 return selectCOPY(I); 2269 return true; 2270 } 2271 2272 switch (I.getOpcode()) { 2273 case TargetOpcode::G_AND: 2274 case TargetOpcode::G_OR: 2275 case TargetOpcode::G_XOR: 2276 if (selectImpl(I, *CoverageInfo)) 2277 return true; 2278 return selectG_AND_OR_XOR(I); 2279 case TargetOpcode::G_ADD: 2280 case TargetOpcode::G_SUB: 2281 if (selectImpl(I, *CoverageInfo)) 2282 return true; 2283 return selectG_ADD_SUB(I); 2284 case TargetOpcode::G_UADDO: 2285 case TargetOpcode::G_USUBO: 2286 case TargetOpcode::G_UADDE: 2287 case TargetOpcode::G_USUBE: 2288 return selectG_UADDO_USUBO_UADDE_USUBE(I); 2289 case TargetOpcode::G_INTTOPTR: 2290 case TargetOpcode::G_BITCAST: 2291 case TargetOpcode::G_PTRTOINT: 2292 return selectCOPY(I); 2293 case TargetOpcode::G_CONSTANT: 2294 case TargetOpcode::G_FCONSTANT: 2295 return selectG_CONSTANT(I); 2296 case TargetOpcode::G_FNEG: 2297 if (selectImpl(I, *CoverageInfo)) 2298 return true; 2299 return selectG_FNEG(I); 2300 case TargetOpcode::G_EXTRACT: 2301 return selectG_EXTRACT(I); 2302 case TargetOpcode::G_MERGE_VALUES: 2303 case TargetOpcode::G_BUILD_VECTOR: 2304 case TargetOpcode::G_CONCAT_VECTORS: 2305 return selectG_MERGE_VALUES(I); 2306 case TargetOpcode::G_UNMERGE_VALUES: 2307 return selectG_UNMERGE_VALUES(I); 2308 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2309 return selectG_BUILD_VECTOR_TRUNC(I); 2310 case TargetOpcode::G_PTR_ADD: 2311 return selectG_PTR_ADD(I); 2312 case TargetOpcode::G_IMPLICIT_DEF: 2313 return selectG_IMPLICIT_DEF(I); 2314 case TargetOpcode::G_INSERT: 2315 return selectG_INSERT(I); 2316 case TargetOpcode::G_INTRINSIC: 2317 return selectG_INTRINSIC(I); 2318 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 2319 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 2320 case TargetOpcode::G_ICMP: 2321 if (selectG_ICMP(I)) 2322 return true; 2323 return selectImpl(I, *CoverageInfo); 2324 case TargetOpcode::G_LOAD: 2325 case TargetOpcode::G_ATOMIC_CMPXCHG: 2326 case TargetOpcode::G_ATOMICRMW_XCHG: 2327 case TargetOpcode::G_ATOMICRMW_ADD: 2328 case TargetOpcode::G_ATOMICRMW_SUB: 2329 case TargetOpcode::G_ATOMICRMW_AND: 2330 case TargetOpcode::G_ATOMICRMW_OR: 2331 case 
TargetOpcode::G_ATOMICRMW_XOR: 2332 case TargetOpcode::G_ATOMICRMW_MIN: 2333 case TargetOpcode::G_ATOMICRMW_MAX: 2334 case TargetOpcode::G_ATOMICRMW_UMIN: 2335 case TargetOpcode::G_ATOMICRMW_UMAX: 2336 case TargetOpcode::G_ATOMICRMW_FADD: 2337 return selectG_LOAD_ATOMICRMW(I); 2338 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 2339 return selectG_AMDGPU_ATOMIC_CMPXCHG(I); 2340 case TargetOpcode::G_SELECT: 2341 return selectG_SELECT(I); 2342 case TargetOpcode::G_STORE: 2343 return selectG_STORE(I); 2344 case TargetOpcode::G_TRUNC: 2345 return selectG_TRUNC(I); 2346 case TargetOpcode::G_SEXT: 2347 case TargetOpcode::G_ZEXT: 2348 case TargetOpcode::G_ANYEXT: 2349 case TargetOpcode::G_SEXT_INREG: 2350 if (selectImpl(I, *CoverageInfo)) 2351 return true; 2352 return selectG_SZA_EXT(I); 2353 case TargetOpcode::G_BRCOND: 2354 return selectG_BRCOND(I); 2355 case TargetOpcode::G_FRAME_INDEX: 2356 case TargetOpcode::G_GLOBAL_VALUE: 2357 return selectG_FRAME_INDEX_GLOBAL_VALUE(I); 2358 case TargetOpcode::G_PTR_MASK: 2359 return selectG_PTR_MASK(I); 2360 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2361 return selectG_EXTRACT_VECTOR_ELT(I); 2362 case TargetOpcode::G_INSERT_VECTOR_ELT: 2363 return selectG_INSERT_VECTOR_ELT(I); 2364 case TargetOpcode::G_SHUFFLE_VECTOR: 2365 return selectG_SHUFFLE_VECTOR(I); 2366 case AMDGPU::G_AMDGPU_ATOMIC_INC: 2367 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 2368 initM0(I); 2369 return selectImpl(I, *CoverageInfo); 2370 default: 2371 return selectImpl(I, *CoverageInfo); 2372 } 2373 return false; 2374 } 2375 2376 InstructionSelector::ComplexRendererFns 2377 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 2378 return {{ 2379 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2380 }}; 2381 2382 } 2383 2384 std::pair<Register, unsigned> 2385 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const { 2386 Register Src = Root.getReg(); 2387 Register OrigSrc = Src; 2388 unsigned Mods = 0; 2389 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 2390 2391 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 2392 Src = MI->getOperand(1).getReg(); 2393 Mods |= SISrcMods::NEG; 2394 MI = getDefIgnoringCopies(Src, *MRI); 2395 } 2396 2397 if (MI && MI->getOpcode() == AMDGPU::G_FABS) { 2398 Src = MI->getOperand(1).getReg(); 2399 Mods |= SISrcMods::ABS; 2400 } 2401 2402 if (Mods != 0 && 2403 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 2404 MachineInstr *UseMI = Root.getParent(); 2405 2406 // If we looked through copies to find source modifiers on an SGPR operand, 2407 // we now have an SGPR register source. To avoid potentially violating the 2408 // constant bus restriction, we need to insert a copy to a VGPR. 2409 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); 2410 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 2411 TII.get(AMDGPU::COPY), VGPRSrc) 2412 .addReg(Src); 2413 Src = VGPRSrc; 2414 } 2415 2416 return std::make_pair(Src, Mods); 2417 } 2418 2419 /// 2420 /// This will select either an SGPR or VGPR operand and will save us from 2421 /// having to write an extra tablegen pattern. 
2422 InstructionSelector::ComplexRendererFns 2423 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 2424 return {{ 2425 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 2426 }}; 2427 } 2428 2429 InstructionSelector::ComplexRendererFns 2430 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 2431 Register Src; 2432 unsigned Mods; 2433 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2434 2435 return {{ 2436 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2437 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 2438 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2439 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2440 }}; 2441 } 2442 2443 InstructionSelector::ComplexRendererFns 2444 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 2445 return {{ 2446 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 2447 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 2448 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 2449 }}; 2450 } 2451 2452 InstructionSelector::ComplexRendererFns 2453 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 2454 Register Src; 2455 unsigned Mods; 2456 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2457 2458 return {{ 2459 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2460 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2461 }}; 2462 } 2463 2464 InstructionSelector::ComplexRendererFns 2465 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 2466 Register Reg = Root.getReg(); 2467 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 2468 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 2469 Def->getOpcode() == AMDGPU::G_FABS)) 2470 return {}; 2471 return {{ 2472 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2473 }}; 2474 } 2475 2476 std::pair<Register, unsigned> 2477 AMDGPUInstructionSelector::selectVOP3PModsImpl( 2478 Register Src, const MachineRegisterInfo &MRI) const { 2479 unsigned Mods = 0; 2480 MachineInstr *MI = MRI.getVRegDef(Src); 2481 2482 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 2483 // It's possible to see an f32 fneg here, but unlikely. 2484 // TODO: Treat f32 fneg as only high bit. 2485 MRI.getType(Src) == LLT::vector(2, 16)) { 2486 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 2487 Src = MI->getOperand(1).getReg(); 2488 MI = MRI.getVRegDef(Src); 2489 } 2490 2491 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 2492 2493 // Packed instructions do not have abs modifiers. 
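  // OP_SEL_1 is effectively the default op_sel_hi setting for a packed
  // source (the high half of the operand feeds the high half of the result);
  // only the NEG/NEG_HI bits folded in above deviate from the defaults.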
2494 Mods |= SISrcMods::OP_SEL_1; 2495 2496 return std::make_pair(Src, Mods); 2497 } 2498 2499 InstructionSelector::ComplexRendererFns 2500 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 2501 MachineRegisterInfo &MRI 2502 = Root.getParent()->getParent()->getParent()->getRegInfo(); 2503 2504 Register Src; 2505 unsigned Mods; 2506 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 2507 2508 return {{ 2509 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2510 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2511 }}; 2512 } 2513 2514 InstructionSelector::ComplexRendererFns 2515 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { 2516 Register Src; 2517 unsigned Mods; 2518 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 2519 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) 2520 return None; 2521 2522 return {{ 2523 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 2524 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 2525 }}; 2526 } 2527 2528 InstructionSelector::ComplexRendererFns 2529 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 2530 // FIXME: Handle op_sel 2531 return {{ 2532 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 2533 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods 2534 }}; 2535 } 2536 2537 InstructionSelector::ComplexRendererFns 2538 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 2539 SmallVector<GEPInfo, 4> AddrInfo; 2540 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 2541 2542 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 2543 return None; 2544 2545 const GEPInfo &GEPInfo = AddrInfo[0]; 2546 Optional<int64_t> EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); 2547 if (!EncodedImm) 2548 return None; 2549 2550 unsigned PtrReg = GEPInfo.SgprParts[0]; 2551 return {{ 2552 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 2553 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 2554 }}; 2555 } 2556 2557 InstructionSelector::ComplexRendererFns 2558 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 2559 SmallVector<GEPInfo, 4> AddrInfo; 2560 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 2561 2562 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 2563 return None; 2564 2565 const GEPInfo &GEPInfo = AddrInfo[0]; 2566 unsigned PtrReg = GEPInfo.SgprParts[0]; 2567 Optional<int64_t> EncodedImm = 2568 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 2569 if (!EncodedImm) 2570 return None; 2571 2572 return {{ 2573 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 2574 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 2575 }}; 2576 } 2577 2578 InstructionSelector::ComplexRendererFns 2579 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 2580 MachineInstr *MI = Root.getParent(); 2581 MachineBasicBlock *MBB = MI->getParent(); 2582 2583 SmallVector<GEPInfo, 4> AddrInfo; 2584 getAddrModeInfo(*MI, *MRI, AddrInfo); 2585 2586 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, 2587 // then we can select all ptr + 32-bit offsets not just immediate offsets. 2588 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 2589 return None; 2590 2591 const GEPInfo &GEPInfo = AddrInfo[0]; 2592 if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm)) 2593 return None; 2594 2595 // If we make it this far we have a load with an 32-bit immediate offset. 
2596 // It is OK to select this using a sgpr offset, because we have already 2597 // failed trying to select this load into one of the _IMM variants since 2598 // the _IMM Patterns are considered before the _SGPR patterns. 2599 unsigned PtrReg = GEPInfo.SgprParts[0]; 2600 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2601 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) 2602 .addImm(GEPInfo.Imm); 2603 return {{ 2604 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 2605 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } 2606 }}; 2607 } 2608 2609 template <bool Signed> 2610 InstructionSelector::ComplexRendererFns 2611 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { 2612 MachineInstr *MI = Root.getParent(); 2613 2614 InstructionSelector::ComplexRendererFns Default = {{ 2615 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, 2616 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset 2617 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 2618 }}; 2619 2620 if (!STI.hasFlatInstOffsets()) 2621 return Default; 2622 2623 const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); 2624 if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD) 2625 return Default; 2626 2627 Optional<int64_t> Offset = 2628 getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); 2629 if (!Offset.hasValue()) 2630 return Default; 2631 2632 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 2633 if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed)) 2634 return Default; 2635 2636 Register BasePtr = OpDef->getOperand(1).getReg(); 2637 2638 return {{ 2639 [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, 2640 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, 2641 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc 2642 }}; 2643 } 2644 2645 InstructionSelector::ComplexRendererFns 2646 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 2647 return selectFlatOffsetImpl<false>(Root); 2648 } 2649 2650 InstructionSelector::ComplexRendererFns 2651 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { 2652 return selectFlatOffsetImpl<true>(Root); 2653 } 2654 2655 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 2656 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 2657 return PSV && PSV->isStack(); 2658 } 2659 2660 InstructionSelector::ComplexRendererFns 2661 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 2662 MachineInstr *MI = Root.getParent(); 2663 MachineBasicBlock *MBB = MI->getParent(); 2664 MachineFunction *MF = MBB->getParent(); 2665 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 2666 2667 int64_t Offset = 0; 2668 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) { 2669 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2670 2671 // TODO: Should this be inside the render function? The iterator seems to 2672 // move. 
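    // Only 12 bits of the offset fit in the MUBUF immediate field, so a
    // constant address is split into a materialized vaddr plus a small
    // immediate, e.g. 0x1234 becomes a vaddr of 0x1000 and an immediate
    // offset of 0x234.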
2673 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 2674 HighBits) 2675 .addImm(Offset & ~4095); 2676 2677 return {{[=](MachineInstrBuilder &MIB) { // rsrc 2678 MIB.addReg(Info->getScratchRSrcReg()); 2679 }, 2680 [=](MachineInstrBuilder &MIB) { // vaddr 2681 MIB.addReg(HighBits); 2682 }, 2683 [=](MachineInstrBuilder &MIB) { // soffset 2684 const MachineMemOperand *MMO = *MI->memoperands_begin(); 2685 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 2686 2687 Register SOffsetReg = isStackPtrRelative(PtrInfo) 2688 ? Info->getStackPtrOffsetReg() 2689 : Info->getScratchWaveOffsetReg(); 2690 MIB.addReg(SOffsetReg); 2691 }, 2692 [=](MachineInstrBuilder &MIB) { // offset 2693 MIB.addImm(Offset & 4095); 2694 }}}; 2695 } 2696 2697 assert(Offset == 0); 2698 2699 // Try to fold a frame index directly into the MUBUF vaddr field, and any 2700 // offsets. 2701 Optional<int> FI; 2702 Register VAddr = Root.getReg(); 2703 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) { 2704 if (isBaseWithConstantOffset(Root, *MRI)) { 2705 const MachineOperand &LHS = RootDef->getOperand(1); 2706 const MachineOperand &RHS = RootDef->getOperand(2); 2707 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); 2708 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); 2709 if (LHSDef && RHSDef) { 2710 int64_t PossibleOffset = 2711 RHSDef->getOperand(1).getCImm()->getSExtValue(); 2712 if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && 2713 (!STI.privateMemoryResourceIsRangeChecked() || 2714 KnownBits->signBitIsZero(LHS.getReg()))) { 2715 if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) 2716 FI = LHSDef->getOperand(1).getIndex(); 2717 else 2718 VAddr = LHS.getReg(); 2719 Offset = PossibleOffset; 2720 } 2721 } 2722 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { 2723 FI = RootDef->getOperand(1).getIndex(); 2724 } 2725 } 2726 2727 // If we don't know this private access is a local stack object, it needs to 2728 // be relative to the entry point's scratch wave offset register. 2729 // TODO: Should split large offsets that don't fit like above. 2730 // TODO: Don't use scratch wave offset just because the offset didn't fit. 2731 Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg() 2732 : Info->getScratchWaveOffsetReg(); 2733 2734 return {{[=](MachineInstrBuilder &MIB) { // rsrc 2735 MIB.addReg(Info->getScratchRSrcReg()); 2736 }, 2737 [=](MachineInstrBuilder &MIB) { // vaddr 2738 if (FI.hasValue()) 2739 MIB.addFrameIndex(FI.getValue()); 2740 else 2741 MIB.addReg(VAddr); 2742 }, 2743 [=](MachineInstrBuilder &MIB) { // soffset 2744 MIB.addReg(SOffset); 2745 }, 2746 [=](MachineInstrBuilder &MIB) { // offset 2747 MIB.addImm(Offset); 2748 }}}; 2749 } 2750 2751 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, 2752 int64_t Offset, 2753 unsigned OffsetBits) const { 2754 if ((OffsetBits == 16 && !isUInt<16>(Offset)) || 2755 (OffsetBits == 8 && !isUInt<8>(Offset))) 2756 return false; 2757 2758 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 2759 return true; 2760 2761 // On Southern Islands instruction with a negative base value and an offset 2762 // don't seem to work. 
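  // So only accept the offset if the base is provably non-negative.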
2763 return KnownBits->signBitIsZero(Base); 2764 } 2765 2766 InstructionSelector::ComplexRendererFns 2767 AMDGPUInstructionSelector::selectMUBUFScratchOffset( 2768 MachineOperand &Root) const { 2769 MachineInstr *MI = Root.getParent(); 2770 MachineBasicBlock *MBB = MI->getParent(); 2771 2772 int64_t Offset = 0; 2773 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || 2774 !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) 2775 return {}; 2776 2777 const MachineFunction *MF = MBB->getParent(); 2778 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 2779 const MachineMemOperand *MMO = *MI->memoperands_begin(); 2780 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 2781 2782 Register SOffsetReg = isStackPtrRelative(PtrInfo) 2783 ? Info->getStackPtrOffsetReg() 2784 : Info->getScratchWaveOffsetReg(); 2785 return {{ 2786 [=](MachineInstrBuilder &MIB) { 2787 MIB.addReg(Info->getScratchRSrcReg()); 2788 }, // rsrc 2789 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset 2790 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 2791 }}; 2792 } 2793 2794 std::pair<Register, unsigned> 2795 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 2796 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 2797 if (!RootDef) 2798 return std::make_pair(Root.getReg(), 0); 2799 2800 int64_t ConstAddr = 0; 2801 2802 Register PtrBase; 2803 int64_t Offset; 2804 std::tie(PtrBase, Offset) = 2805 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 2806 2807 if (Offset) { 2808 if (isDSOffsetLegal(PtrBase, Offset, 16)) { 2809 // (add n0, c0) 2810 return std::make_pair(PtrBase, Offset); 2811 } 2812 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 2813 // TODO 2814 2815 2816 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 2817 // TODO 2818 2819 } 2820 2821 return std::make_pair(Root.getReg(), 0); 2822 } 2823 2824 InstructionSelector::ComplexRendererFns 2825 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { 2826 Register Reg; 2827 unsigned Offset; 2828 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); 2829 return {{ 2830 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2831 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } 2832 }}; 2833 } 2834 2835 InstructionSelector::ComplexRendererFns 2836 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { 2837 Register Reg; 2838 unsigned Offset; 2839 std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root); 2840 return {{ 2841 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 2842 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, 2843 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); } 2844 }}; 2845 } 2846 2847 std::pair<Register, unsigned> 2848 AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const { 2849 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 2850 if (!RootDef) 2851 return std::make_pair(Root.getReg(), 0); 2852 2853 int64_t ConstAddr = 0; 2854 2855 Register PtrBase; 2856 int64_t Offset; 2857 std::tie(PtrBase, Offset) = 2858 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 2859 2860 if (Offset) { 2861 int64_t DWordOffset0 = Offset / 4; 2862 int64_t DWordOffset1 = DWordOffset0 + 1; 2863 if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) { 2864 // (add n0, c0) 2865 return std::make_pair(PtrBase, DWordOffset0); 2866 } 2867 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 2868 // TODO 2869 2870 } else if (mi_match(Root.getReg(), *MRI, 
m_ICst(ConstAddr))) { 2871 // TODO 2872 2873 } 2874 2875 return std::make_pair(Root.getReg(), 0); 2876 } 2877 2878 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return 2879 /// the base value with the constant offset. There may be intervening copies 2880 /// between \p Root and the identified constant. Returns \p Root, 0 if this does 2881 /// not match the pattern. 2882 std::pair<Register, int64_t> 2883 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( 2884 Register Root, const MachineRegisterInfo &MRI) const { 2885 MachineInstr *RootI = MRI.getVRegDef(Root); 2886 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) 2887 return {Root, 0}; 2888 2889 MachineOperand &RHS = RootI->getOperand(2); 2890 Optional<ValueAndVReg> MaybeOffset 2891 = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true); 2892 if (!MaybeOffset) 2893 return {Root, 0}; 2894 return {RootI->getOperand(1).getReg(), MaybeOffset->Value}; 2895 } 2896 2897 static void addZeroImm(MachineInstrBuilder &MIB) { 2898 MIB.addImm(0); 2899 } 2900 2901 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p 2902 /// BasePtr is not valid, a null base pointer will be used. 2903 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, 2904 uint32_t FormatLo, uint32_t FormatHi, 2905 Register BasePtr) { 2906 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 2907 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 2908 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2909 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 2910 2911 B.buildInstr(AMDGPU::S_MOV_B32) 2912 .addDef(RSrc2) 2913 .addImm(FormatLo); 2914 B.buildInstr(AMDGPU::S_MOV_B32) 2915 .addDef(RSrc3) 2916 .addImm(FormatHi); 2917 2918 // Build the half of the subregister with the constants before building the 2919 // full 128-bit register. If we are building multiple resource descriptors, 2920 // this will allow CSEing of the 2-component register. 2921 B.buildInstr(AMDGPU::REG_SEQUENCE) 2922 .addDef(RSrcHi) 2923 .addReg(RSrc2) 2924 .addImm(AMDGPU::sub0) 2925 .addReg(RSrc3) 2926 .addImm(AMDGPU::sub1); 2927 2928 Register RSrcLo = BasePtr; 2929 if (!BasePtr) { 2930 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2931 B.buildInstr(AMDGPU::S_MOV_B64) 2932 .addDef(RSrcLo) 2933 .addImm(0); 2934 } 2935 2936 B.buildInstr(AMDGPU::REG_SEQUENCE) 2937 .addDef(RSrc) 2938 .addReg(RSrcLo) 2939 .addImm(AMDGPU::sub0_sub1) 2940 .addReg(RSrcHi) 2941 .addImm(AMDGPU::sub2_sub3); 2942 2943 return RSrc; 2944 } 2945 2946 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 2947 const SIInstrInfo &TII, Register BasePtr) { 2948 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 2949 2950 // FIXME: Why are half the "default" bits ignored based on the addressing 2951 // mode? 2952 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr); 2953 } 2954 2955 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 2956 const SIInstrInfo &TII, Register BasePtr) { 2957 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 2958 2959 // FIXME: Why are half the "default" bits ignored based on the addressing 2960 // mode? 
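  // Unlike the addr64 variant above, this passes -1 for the low format dword
  // (dword2 of the descriptor, i.e. num_records), which appears to make the
  // resource cover the full range since there is no vaddr component to bound
  // the access.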
2961 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr); 2962 } 2963 2964 AMDGPUInstructionSelector::MUBUFAddressData 2965 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { 2966 MUBUFAddressData Data; 2967 Data.N0 = Src; 2968 2969 Register PtrBase; 2970 int64_t Offset; 2971 2972 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); 2973 if (isUInt<32>(Offset)) { 2974 Data.N0 = PtrBase; 2975 Data.Offset = Offset; 2976 } 2977 2978 if (MachineInstr *InputAdd 2979 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) { 2980 Data.N2 = InputAdd->getOperand(1).getReg(); 2981 Data.N3 = InputAdd->getOperand(2).getReg(); 2982 2983 // FIXME: Need to fix extra SGPR->VGPRcopies inserted 2984 // FIXME: Don't know this was defined by operand 0 2985 // 2986 // TODO: Remove this when we have copy folding optimizations after 2987 // RegBankSelect. 2988 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg(); 2989 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg(); 2990 } 2991 2992 return Data; 2993 } 2994 2995 /// Return if the addr64 mubuf mode should be used for the given address. 2996 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const { 2997 // (ptr_add N2, N3) -> addr64, or 2998 // (ptr_add (ptr_add N2, N3), C1) -> addr64 2999 if (Addr.N2) 3000 return true; 3001 3002 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI); 3003 return N0Bank->getID() == AMDGPU::VGPRRegBankID; 3004 } 3005 3006 /// Split an immediate offset \p ImmOffset depending on whether it fits in the 3007 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable 3008 /// component. 3009 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset( 3010 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const { 3011 if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset)) 3012 return; 3013 3014 // Illegal offset, store it in soffset. 3015 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3016 B.buildInstr(AMDGPU::S_MOV_B32) 3017 .addDef(SOffset) 3018 .addImm(ImmOffset); 3019 ImmOffset = 0; 3020 } 3021 3022 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl( 3023 MachineOperand &Root, Register &VAddr, Register &RSrcReg, 3024 Register &SOffset, int64_t &Offset) const { 3025 // FIXME: Predicates should stop this from reaching here. 3026 // addr64 bit was removed for volcanic islands. 3027 if (!STI.hasAddr64() || STI.useFlatForGlobal()) 3028 return false; 3029 3030 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 3031 if (!shouldUseAddr64(AddrData)) 3032 return false; 3033 3034 Register N0 = AddrData.N0; 3035 Register N2 = AddrData.N2; 3036 Register N3 = AddrData.N3; 3037 Offset = AddrData.Offset; 3038 3039 // Base pointer for the SRD. 3040 Register SRDPtr; 3041 3042 if (N2) { 3043 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 3044 assert(N3); 3045 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 3046 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the 3047 // addr64, and construct the default resource from a 0 address. 3048 VAddr = N0; 3049 } else { 3050 SRDPtr = N3; 3051 VAddr = N2; 3052 } 3053 } else { 3054 // N2 is not divergent. 
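      // The scalar half of the ptr_add becomes the SRD base pointer and the
      // other (typically divergent) half becomes the 64-bit vaddr.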
3055 SRDPtr = N2; 3056 VAddr = N3; 3057 } 3058 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 3059 // Use the default null pointer in the resource 3060 VAddr = N0; 3061 } else { 3062 // N0 -> offset, or 3063 // (N0 + C1) -> offset 3064 SRDPtr = N0; 3065 } 3066 3067 MachineIRBuilder B(*Root.getParent()); 3068 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr); 3069 splitIllegalMUBUFOffset(B, SOffset, Offset); 3070 return true; 3071 } 3072 3073 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( 3074 MachineOperand &Root, Register &RSrcReg, Register &SOffset, 3075 int64_t &Offset) const { 3076 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 3077 if (shouldUseAddr64(AddrData)) 3078 return false; 3079 3080 // N0 -> offset, or 3081 // (N0 + C1) -> offset 3082 Register SRDPtr = AddrData.N0; 3083 Offset = AddrData.Offset; 3084 3085 // TODO: Look through extensions for 32-bit soffset. 3086 MachineIRBuilder B(*Root.getParent()); 3087 3088 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); 3089 splitIllegalMUBUFOffset(B, SOffset, Offset); 3090 return true; 3091 } 3092 3093 InstructionSelector::ComplexRendererFns 3094 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { 3095 Register VAddr; 3096 Register RSrcReg; 3097 Register SOffset; 3098 int64_t Offset = 0; 3099 3100 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 3101 return {}; 3102 3103 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 3104 // pattern. 3105 return {{ 3106 [=](MachineInstrBuilder &MIB) { // rsrc 3107 MIB.addReg(RSrcReg); 3108 }, 3109 [=](MachineInstrBuilder &MIB) { // vaddr 3110 MIB.addReg(VAddr); 3111 }, 3112 [=](MachineInstrBuilder &MIB) { // soffset 3113 if (SOffset) 3114 MIB.addReg(SOffset); 3115 else 3116 MIB.addImm(0); 3117 }, 3118 [=](MachineInstrBuilder &MIB) { // offset 3119 MIB.addImm(Offset); 3120 }, 3121 addZeroImm, // glc 3122 addZeroImm, // slc 3123 addZeroImm, // tfe 3124 addZeroImm, // dlc 3125 addZeroImm // swz 3126 }}; 3127 } 3128 3129 InstructionSelector::ComplexRendererFns 3130 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { 3131 Register RSrcReg; 3132 Register SOffset; 3133 int64_t Offset = 0; 3134 3135 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 3136 return {}; 3137 3138 return {{ 3139 [=](MachineInstrBuilder &MIB) { // rsrc 3140 MIB.addReg(RSrcReg); 3141 }, 3142 [=](MachineInstrBuilder &MIB) { // soffset 3143 if (SOffset) 3144 MIB.addReg(SOffset); 3145 else 3146 MIB.addImm(0); 3147 }, 3148 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 3149 addZeroImm, // glc 3150 addZeroImm, // slc 3151 addZeroImm, // tfe 3152 addZeroImm, // dlc 3153 addZeroImm // swz 3154 }}; 3155 } 3156 3157 InstructionSelector::ComplexRendererFns 3158 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const { 3159 Register VAddr; 3160 Register RSrcReg; 3161 Register SOffset; 3162 int64_t Offset = 0; 3163 3164 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 3165 return {}; 3166 3167 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 3168 // pattern. 
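  // The atomic variants only append the slc bit after the offset; the
  // non-atomic selectors above also add glc/tfe/dlc/swz defaults.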
3169 return {{ 3170 [=](MachineInstrBuilder &MIB) { // rsrc 3171 MIB.addReg(RSrcReg); 3172 }, 3173 [=](MachineInstrBuilder &MIB) { // vaddr 3174 MIB.addReg(VAddr); 3175 }, 3176 [=](MachineInstrBuilder &MIB) { // soffset 3177 if (SOffset) 3178 MIB.addReg(SOffset); 3179 else 3180 MIB.addImm(0); 3181 }, 3182 [=](MachineInstrBuilder &MIB) { // offset 3183 MIB.addImm(Offset); 3184 }, 3185 addZeroImm // slc 3186 }}; 3187 } 3188 3189 InstructionSelector::ComplexRendererFns 3190 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const { 3191 Register RSrcReg; 3192 Register SOffset; 3193 int64_t Offset = 0; 3194 3195 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 3196 return {}; 3197 3198 return {{ 3199 [=](MachineInstrBuilder &MIB) { // rsrc 3200 MIB.addReg(RSrcReg); 3201 }, 3202 [=](MachineInstrBuilder &MIB) { // soffset 3203 if (SOffset) 3204 MIB.addReg(SOffset); 3205 else 3206 MIB.addImm(0); 3207 }, 3208 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 3209 addZeroImm // slc 3210 }}; 3211 } 3212 3213 /// Get an immediate that must be 32-bits, and treated as zero extended. 3214 static Optional<uint64_t> getConstantZext32Val(Register Reg, 3215 const MachineRegisterInfo &MRI) { 3216 // getConstantVRegVal sexts any values, so see if that matters. 3217 Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI); 3218 if (!OffsetVal || !isInt<32>(*OffsetVal)) 3219 return None; 3220 return Lo_32(*OffsetVal); 3221 } 3222 3223 InstructionSelector::ComplexRendererFns 3224 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const { 3225 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); 3226 if (!OffsetVal) 3227 return {}; 3228 3229 Optional<int64_t> EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal); 3230 if (!EncodedImm) 3231 return {}; 3232 3233 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 3234 } 3235 3236 InstructionSelector::ComplexRendererFns 3237 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const { 3238 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); 3239 3240 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); 3241 if (!OffsetVal) 3242 return {}; 3243 3244 Optional<int64_t> EncodedImm 3245 = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal); 3246 if (!EncodedImm) 3247 return {}; 3248 3249 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 3250 } 3251 3252 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, 3253 const MachineInstr &MI, 3254 int OpIdx) const { 3255 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 3256 "Expected G_CONSTANT"); 3257 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue()); 3258 } 3259 3260 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, 3261 const MachineInstr &MI, 3262 int OpIdx) const { 3263 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 3264 "Expected G_CONSTANT"); 3265 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue()); 3266 } 3267 3268 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB, 3269 const MachineInstr &MI, 3270 int OpIdx) const { 3271 assert(OpIdx == -1); 3272 3273 const MachineOperand &Op = MI.getOperand(1); 3274 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) 3275 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 3276 else { 3277 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); 
3278 MIB.addImm(Op.getCImm()->getSExtValue()); 3279 } 3280 } 3281 3282 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, 3283 const MachineInstr &MI, 3284 int OpIdx) const { 3285 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 3286 "Expected G_CONSTANT"); 3287 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); 3288 } 3289 3290 /// This only really exists to satisfy DAG type checking machinery, so is a 3291 /// no-op here. 3292 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, 3293 const MachineInstr &MI, 3294 int OpIdx) const { 3295 MIB.addImm(MI.getOperand(OpIdx).getImm()); 3296 } 3297 3298 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB, 3299 const MachineInstr &MI, 3300 int OpIdx) const { 3301 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3302 MIB.addImm(MI.getOperand(OpIdx).getImm() & 1); 3303 } 3304 3305 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB, 3306 const MachineInstr &MI, 3307 int OpIdx) const { 3308 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3309 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1); 3310 } 3311 3312 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB, 3313 const MachineInstr &MI, 3314 int OpIdx) const { 3315 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3316 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1); 3317 } 3318 3319 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, 3320 const MachineInstr &MI, 3321 int OpIdx) const { 3322 assert(OpIdx >= 0 && "expected to match an immediate operand"); 3323 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); 3324 } 3325 3326 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { 3327 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); 3328 } 3329 3330 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const { 3331 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm()); 3332 } 3333 3334 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const { 3335 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm()); 3336 } 3337 3338 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { 3339 return TII.isInlineConstant(Imm); 3340 } 3341