1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUInstructionSelector.h" 15 #include "AMDGPUInstrInfo.h" 16 #include "AMDGPUGlobalISelUtils.h" 17 #include "AMDGPURegisterBankInfo.h" 18 #include "AMDGPURegisterInfo.h" 19 #include "AMDGPUSubtarget.h" 20 #include "AMDGPUTargetMachine.h" 21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 22 #include "SIMachineFunctionInfo.h" 23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 24 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 25 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 28 #include "llvm/CodeGen/GlobalISel/Utils.h" 29 #include "llvm/CodeGen/MachineBasicBlock.h" 30 #include "llvm/CodeGen/MachineFunction.h" 31 #include "llvm/CodeGen/MachineInstr.h" 32 #include "llvm/CodeGen/MachineInstrBuilder.h" 33 #include "llvm/CodeGen/MachineRegisterInfo.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/Support/Debug.h" 36 #include "llvm/Support/raw_ostream.h" 37 38 #define DEBUG_TYPE "amdgpu-isel" 39 40 using namespace llvm; 41 using namespace MIPatternMatch; 42 43 #define GET_GLOBALISEL_IMPL 44 #define AMDGPUSubtarget GCNSubtarget 45 #include "AMDGPUGenGlobalISel.inc" 46 #undef GET_GLOBALISEL_IMPL 47 #undef AMDGPUSubtarget 48 49 AMDGPUInstructionSelector::AMDGPUInstructionSelector( 50 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, 51 const AMDGPUTargetMachine &TM) 52 : InstructionSelector(), TII(*STI.getInstrInfo()), 53 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), 54 STI(STI), 55 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), 56 #define GET_GLOBALISEL_PREDICATES_INIT 57 #include "AMDGPUGenGlobalISel.inc" 58 #undef GET_GLOBALISEL_PREDICATES_INIT 59 #define GET_GLOBALISEL_TEMPORARIES_INIT 60 #include "AMDGPUGenGlobalISel.inc" 61 #undef GET_GLOBALISEL_TEMPORARIES_INIT 62 { 63 } 64 65 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } 66 67 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, 68 CodeGenCoverage &CoverageInfo) { 69 MRI = &MF.getRegInfo(); 70 InstructionSelector::setupMF(MF, KB, CoverageInfo); 71 } 72 73 bool AMDGPUInstructionSelector::isVCC(Register Reg, 74 const MachineRegisterInfo &MRI) const { 75 if (Register::isPhysicalRegister(Reg)) 76 return Reg == TRI.getVCC(); 77 78 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 79 const TargetRegisterClass *RC = 80 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 81 if (RC) { 82 const LLT Ty = MRI.getType(Reg); 83 return RC->hasSuperClassEq(TRI.getBoolRC()) && 84 Ty.isValid() && Ty.getSizeInBits() == 1; 85 } 86 87 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 88 return RB->getID() == AMDGPU::VCCRegBankID; 89 } 90 91 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, 92 unsigned NewOpc) const { 93 MI.setDesc(TII.get(NewOpc)); 94 
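  // The generic intrinsic has the form (dst, intrinsic id, src); dropping the
  // intrinsic ID below leaves (dst, src) to match the copy-like pseudo, and
  // the added implicit operand models the pseudo's read of exec.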
  MI.RemoveOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.

      // TODO: Skip masking high bits if def is known boolean.

      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
        .addImm(1)
        .addReg(SrcReg);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(MaskedReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to specifically
    // know that the result is VCCRegBank, and not just an SGPR with size 1.
    // An SReg_32 with size 1 is ambiguous with wave32.
174 if (Src.isUndef()) { 175 const TargetRegisterClass *SrcRC = 176 TRI.getConstrainedRegClassForOperand(Src, *MRI); 177 if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 178 return false; 179 } 180 181 return true; 182 } 183 184 for (const MachineOperand &MO : I.operands()) { 185 if (Register::isPhysicalRegister(MO.getReg())) 186 continue; 187 188 const TargetRegisterClass *RC = 189 TRI.getConstrainedRegClassForOperand(MO, *MRI); 190 if (!RC) 191 continue; 192 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 193 } 194 return true; 195 } 196 197 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { 198 const Register DefReg = I.getOperand(0).getReg(); 199 const LLT DefTy = MRI->getType(DefReg); 200 201 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy) 202 203 const RegClassOrRegBank &RegClassOrBank = 204 MRI->getRegClassOrRegBank(DefReg); 205 206 const TargetRegisterClass *DefRC 207 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 208 if (!DefRC) { 209 if (!DefTy.isValid()) { 210 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 211 return false; 212 } 213 214 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 215 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); 216 if (!DefRC) { 217 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 218 return false; 219 } 220 } 221 222 // TODO: Verify that all registers have the same bank 223 I.setDesc(TII.get(TargetOpcode::PHI)); 224 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); 225 } 226 227 MachineOperand 228 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, 229 const TargetRegisterClass &SubRC, 230 unsigned SubIdx) const { 231 232 MachineInstr *MI = MO.getParent(); 233 MachineBasicBlock *BB = MO.getParent()->getParent(); 234 Register DstReg = MRI->createVirtualRegister(&SubRC); 235 236 if (MO.isReg()) { 237 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); 238 Register Reg = MO.getReg(); 239 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 240 .addReg(Reg, 0, ComposedSubIdx); 241 242 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), 243 MO.isKill(), MO.isDead(), MO.isUndef(), 244 MO.isEarlyClobber(), 0, MO.isDebug(), 245 MO.isInternalRead()); 246 } 247 248 assert(MO.isImm()); 249 250 APInt Imm(64, MO.getImm()); 251 252 switch (SubIdx) { 253 default: 254 llvm_unreachable("do not know to split immediate with this sub index."); 255 case AMDGPU::sub0: 256 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); 257 case AMDGPU::sub1: 258 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); 259 } 260 } 261 262 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { 263 switch (Opc) { 264 case AMDGPU::G_AND: 265 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 266 case AMDGPU::G_OR: 267 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; 268 case AMDGPU::G_XOR: 269 return Is64 ? 
AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 270 default: 271 llvm_unreachable("not a bit op"); 272 } 273 } 274 275 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 276 MachineOperand &Dst = I.getOperand(0); 277 MachineOperand &Src0 = I.getOperand(1); 278 MachineOperand &Src1 = I.getOperand(2); 279 Register DstReg = Dst.getReg(); 280 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 281 282 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 283 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 284 const TargetRegisterClass *RC = TRI.getBoolRC(); 285 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), 286 RC == &AMDGPU::SReg_64RegClass); 287 I.setDesc(TII.get(InstOpc)); 288 289 // FIXME: Hack to avoid turning the register bank into a register class. 290 // The selector for G_ICMP relies on seeing the register bank for the result 291 // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will 292 // be ambiguous whether it's a scalar or vector bool. 293 if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) 294 MRI->setRegClass(Src0.getReg(), RC); 295 if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) 296 MRI->setRegClass(Src1.getReg(), RC); 297 298 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 299 } 300 301 // TODO: Should this allow an SCC bank result, and produce a copy from SCC for 302 // the result? 303 if (DstRB->getID() == AMDGPU::SGPRRegBankID) { 304 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); 305 I.setDesc(TII.get(InstOpc)); 306 // Dead implicit-def of scc 307 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 308 true, // isImp 309 false, // isKill 310 true)); // isDead 311 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 312 } 313 314 return false; 315 } 316 317 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 318 MachineBasicBlock *BB = I.getParent(); 319 MachineFunction *MF = BB->getParent(); 320 Register DstReg = I.getOperand(0).getReg(); 321 const DebugLoc &DL = I.getDebugLoc(); 322 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 323 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 324 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 325 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 326 327 if (Size == 32) { 328 if (IsSALU) { 329 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 330 MachineInstr *Add = 331 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 332 .add(I.getOperand(1)) 333 .add(I.getOperand(2)); 334 I.eraseFromParent(); 335 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 336 } 337 338 if (STI.hasAddNoCarry()) { 339 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 340 I.setDesc(TII.get(Opc)); 341 I.addOperand(*MF, MachineOperand::CreateImm(0)); 342 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 343 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 344 } 345 346 const unsigned Opc = Sub ? 
AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; 347 348 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); 349 MachineInstr *Add 350 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 351 .addDef(UnusedCarry, RegState::Dead) 352 .add(I.getOperand(1)) 353 .add(I.getOperand(2)) 354 .addImm(0); 355 I.eraseFromParent(); 356 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 357 } 358 359 assert(!Sub && "illegal sub should not reach here"); 360 361 const TargetRegisterClass &RC 362 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; 363 const TargetRegisterClass &HalfRC 364 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass; 365 366 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0)); 367 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0)); 368 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); 369 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); 370 371 Register DstLo = MRI->createVirtualRegister(&HalfRC); 372 Register DstHi = MRI->createVirtualRegister(&HalfRC); 373 374 if (IsSALU) { 375 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) 376 .add(Lo1) 377 .add(Lo2); 378 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) 379 .add(Hi1) 380 .add(Hi2); 381 } else { 382 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); 383 Register CarryReg = MRI->createVirtualRegister(CarryRC); 384 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo) 385 .addDef(CarryReg) 386 .add(Lo1) 387 .add(Lo2) 388 .addImm(0); 389 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) 390 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) 391 .add(Hi1) 392 .add(Hi2) 393 .addReg(CarryReg, RegState::Kill) 394 .addImm(0); 395 396 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) 397 return false; 398 } 399 400 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 401 .addReg(DstLo) 402 .addImm(AMDGPU::sub0) 403 .addReg(DstHi) 404 .addImm(AMDGPU::sub1); 405 406 407 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 408 return false; 409 410 I.eraseFromParent(); 411 return true; 412 } 413 414 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( 415 MachineInstr &I) const { 416 MachineBasicBlock *BB = I.getParent(); 417 MachineFunction *MF = BB->getParent(); 418 const DebugLoc &DL = I.getDebugLoc(); 419 Register Dst0Reg = I.getOperand(0).getReg(); 420 Register Dst1Reg = I.getOperand(1).getReg(); 421 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO || 422 I.getOpcode() == AMDGPU::G_UADDE; 423 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE || 424 I.getOpcode() == AMDGPU::G_USUBE; 425 426 if (isVCC(Dst1Reg, *MRI)) { 427 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned 428 // carry out despite the _i32 name. These were renamed in VI to _U32. 429 // FIXME: We should probably rename the opcodes here. 430 unsigned NoCarryOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; 431 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 432 I.setDesc(TII.get(HasCarryIn ? 
CarryOpc : NoCarryOpc)); 433 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 434 I.addOperand(*MF, MachineOperand::CreateImm(0)); 435 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 436 } 437 438 Register Src0Reg = I.getOperand(2).getReg(); 439 Register Src1Reg = I.getOperand(3).getReg(); 440 441 if (HasCarryIn) { 442 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 443 .addReg(I.getOperand(4).getReg()); 444 } 445 446 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 447 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 448 449 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg) 450 .add(I.getOperand(2)) 451 .add(I.getOperand(3)); 452 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 453 .addReg(AMDGPU::SCC); 454 455 if (!MRI->getRegClassOrNull(Dst1Reg)) 456 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 457 458 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 459 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 460 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 461 return false; 462 463 if (HasCarryIn && 464 !RBI.constrainGenericRegister(I.getOperand(4).getReg(), 465 AMDGPU::SReg_32RegClass, *MRI)) 466 return false; 467 468 I.eraseFromParent(); 469 return true; 470 } 471 472 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 473 MachineBasicBlock *BB = I.getParent(); 474 Register DstReg = I.getOperand(0).getReg(); 475 Register SrcReg = I.getOperand(1).getReg(); 476 LLT DstTy = MRI->getType(DstReg); 477 LLT SrcTy = MRI->getType(SrcReg); 478 const unsigned SrcSize = SrcTy.getSizeInBits(); 479 const unsigned DstSize = DstTy.getSizeInBits(); 480 481 // TODO: Should handle any multiple of 32 offset. 
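  // Only offsets that fall on a destination-sized boundary are handled here;
  // the extract then becomes a plain subregister copy. For example, extracting
  // s32 at bit offset 64 from an s128 source picks SubRegs[64 / 32], i.e. the
  // third 32-bit subregister of the source class.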
482 unsigned Offset = I.getOperand(2).getImm(); 483 if (Offset % DstSize != 0) 484 return false; 485 486 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 487 const TargetRegisterClass *SrcRC = 488 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 489 if (!SrcRC) 490 return false; 491 492 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 493 494 const DebugLoc &DL = I.getDebugLoc(); 495 MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) 496 .addReg(SrcReg, 0, SubRegs[Offset / DstSize]); 497 498 for (const MachineOperand &MO : Copy->operands()) { 499 const TargetRegisterClass *RC = 500 TRI.getConstrainedRegClassForOperand(MO, *MRI); 501 if (!RC) 502 continue; 503 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 504 } 505 I.eraseFromParent(); 506 return true; 507 } 508 509 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 510 MachineBasicBlock *BB = MI.getParent(); 511 Register DstReg = MI.getOperand(0).getReg(); 512 LLT DstTy = MRI->getType(DstReg); 513 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 514 515 const unsigned SrcSize = SrcTy.getSizeInBits(); 516 if (SrcSize < 32) 517 return selectImpl(MI, *CoverageInfo); 518 519 const DebugLoc &DL = MI.getDebugLoc(); 520 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 521 const unsigned DstSize = DstTy.getSizeInBits(); 522 const TargetRegisterClass *DstRC = 523 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 524 if (!DstRC) 525 return false; 526 527 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 528 MachineInstrBuilder MIB = 529 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 530 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 531 MachineOperand &Src = MI.getOperand(I + 1); 532 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 533 MIB.addImm(SubRegs[I]); 534 535 const TargetRegisterClass *SrcRC 536 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 537 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 538 return false; 539 } 540 541 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 542 return false; 543 544 MI.eraseFromParent(); 545 return true; 546 } 547 548 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 549 MachineBasicBlock *BB = MI.getParent(); 550 const int NumDst = MI.getNumOperands() - 1; 551 552 MachineOperand &Src = MI.getOperand(NumDst); 553 554 Register SrcReg = Src.getReg(); 555 Register DstReg0 = MI.getOperand(0).getReg(); 556 LLT DstTy = MRI->getType(DstReg0); 557 LLT SrcTy = MRI->getType(SrcReg); 558 559 const unsigned DstSize = DstTy.getSizeInBits(); 560 const unsigned SrcSize = SrcTy.getSizeInBits(); 561 const DebugLoc &DL = MI.getDebugLoc(); 562 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 563 564 const TargetRegisterClass *SrcRC = 565 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 566 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 567 return false; 568 569 const unsigned SrcFlags = getUndefRegState(Src.isUndef()); 570 571 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 572 // source, and this relies on the fact that the same subregister indices are 573 // used for both. 
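  // Each result is selected as a plain COPY from the corresponding subregister
  // of the source, e.g. unmerging an s64 into two s32 values copies from sub0
  // and sub1.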
574 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 575 for (int I = 0, E = NumDst; I != E; ++I) { 576 MachineOperand &Dst = MI.getOperand(I); 577 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 578 .addReg(SrcReg, SrcFlags, SubRegs[I]); 579 580 const TargetRegisterClass *DstRC = 581 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 582 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 583 return false; 584 } 585 586 MI.eraseFromParent(); 587 return true; 588 } 589 590 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 591 return selectG_ADD_SUB(I); 592 } 593 594 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 595 const MachineOperand &MO = I.getOperand(0); 596 597 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 598 // regbank check here is to know why getConstrainedRegClassForOperand failed. 599 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 600 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 601 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 602 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 603 return true; 604 } 605 606 return false; 607 } 608 609 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 610 MachineBasicBlock *BB = I.getParent(); 611 612 Register DstReg = I.getOperand(0).getReg(); 613 Register Src0Reg = I.getOperand(1).getReg(); 614 Register Src1Reg = I.getOperand(2).getReg(); 615 LLT Src1Ty = MRI->getType(Src1Reg); 616 617 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 618 unsigned InsSize = Src1Ty.getSizeInBits(); 619 620 int64_t Offset = I.getOperand(3).getImm(); 621 if (Offset % 32 != 0) 622 return false; 623 624 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 625 if (SubReg == AMDGPU::NoSubRegister) 626 return false; 627 628 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 629 const TargetRegisterClass *DstRC = 630 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 631 if (!DstRC) 632 return false; 633 634 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 635 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 636 const TargetRegisterClass *Src0RC = 637 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 638 const TargetRegisterClass *Src1RC = 639 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 640 641 // Deal with weird cases where the class only partially supports the subreg 642 // index. 
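  // getSubClassWithSubReg returns null if no subclass of Src0RC supports
  // SubReg, in which case bail rather than emit an invalid INSERT_SUBREG. For
  // example, inserting s32 at bit offset 32 of an s64 uses
  // getSubRegFromChannel(1, 1), i.e. the sub1 index.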
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
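    // The intrinsic is (dst, id, src0, src1); skipping the ID operand, the
    // def and operands 2 and 3 map directly onto SI_IF_BREAK, and all three
    // registers are placed in the wave mask register class below.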
715 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK)) 716 .add(I.getOperand(0)) 717 .add(I.getOperand(2)) 718 .add(I.getOperand(3)); 719 720 Register DstReg = I.getOperand(0).getReg(); 721 Register Src0Reg = I.getOperand(2).getReg(); 722 Register Src1Reg = I.getOperand(3).getReg(); 723 724 I.eraseFromParent(); 725 726 for (Register Reg : { DstReg, Src0Reg, Src1Reg }) 727 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 728 729 return true; 730 } 731 case Intrinsic::amdgcn_interp_p1_f16: 732 return selectInterpP1F16(I); 733 case Intrinsic::amdgcn_wqm: 734 return constrainCopyLikeIntrin(I, AMDGPU::WQM); 735 case Intrinsic::amdgcn_softwqm: 736 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); 737 case Intrinsic::amdgcn_wwm: 738 return constrainCopyLikeIntrin(I, AMDGPU::WWM); 739 default: 740 return selectImpl(I, *CoverageInfo); 741 } 742 } 743 744 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) { 745 if (Size != 32 && Size != 64) 746 return -1; 747 switch (P) { 748 default: 749 llvm_unreachable("Unknown condition code!"); 750 case CmpInst::ICMP_NE: 751 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64; 752 case CmpInst::ICMP_EQ: 753 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64; 754 case CmpInst::ICMP_SGT: 755 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64; 756 case CmpInst::ICMP_SGE: 757 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64; 758 case CmpInst::ICMP_SLT: 759 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64; 760 case CmpInst::ICMP_SLE: 761 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64; 762 case CmpInst::ICMP_UGT: 763 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64; 764 case CmpInst::ICMP_UGE: 765 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64; 766 case CmpInst::ICMP_ULT: 767 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64; 768 case CmpInst::ICMP_ULE: 769 return Size == 32 ? 
AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64; 770 } 771 } 772 773 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, 774 unsigned Size) const { 775 if (Size == 64) { 776 if (!STI.hasScalarCompareEq64()) 777 return -1; 778 779 switch (P) { 780 case CmpInst::ICMP_NE: 781 return AMDGPU::S_CMP_LG_U64; 782 case CmpInst::ICMP_EQ: 783 return AMDGPU::S_CMP_EQ_U64; 784 default: 785 return -1; 786 } 787 } 788 789 if (Size != 32) 790 return -1; 791 792 switch (P) { 793 case CmpInst::ICMP_NE: 794 return AMDGPU::S_CMP_LG_U32; 795 case CmpInst::ICMP_EQ: 796 return AMDGPU::S_CMP_EQ_U32; 797 case CmpInst::ICMP_SGT: 798 return AMDGPU::S_CMP_GT_I32; 799 case CmpInst::ICMP_SGE: 800 return AMDGPU::S_CMP_GE_I32; 801 case CmpInst::ICMP_SLT: 802 return AMDGPU::S_CMP_LT_I32; 803 case CmpInst::ICMP_SLE: 804 return AMDGPU::S_CMP_LE_I32; 805 case CmpInst::ICMP_UGT: 806 return AMDGPU::S_CMP_GT_U32; 807 case CmpInst::ICMP_UGE: 808 return AMDGPU::S_CMP_GE_U32; 809 case CmpInst::ICMP_ULT: 810 return AMDGPU::S_CMP_LT_U32; 811 case CmpInst::ICMP_ULE: 812 return AMDGPU::S_CMP_LE_U32; 813 default: 814 llvm_unreachable("Unknown condition code!"); 815 } 816 } 817 818 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { 819 MachineBasicBlock *BB = I.getParent(); 820 const DebugLoc &DL = I.getDebugLoc(); 821 822 Register SrcReg = I.getOperand(2).getReg(); 823 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 824 825 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 826 827 Register CCReg = I.getOperand(0).getReg(); 828 if (!isVCC(CCReg, *MRI)) { 829 int Opcode = getS_CMPOpcode(Pred, Size); 830 if (Opcode == -1) 831 return false; 832 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) 833 .add(I.getOperand(2)) 834 .add(I.getOperand(3)); 835 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) 836 .addReg(AMDGPU::SCC); 837 bool Ret = 838 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && 839 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); 840 I.eraseFromParent(); 841 return Ret; 842 } 843 844 int Opcode = getV_CMPOpcode(Pred, Size); 845 if (Opcode == -1) 846 return false; 847 848 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), 849 I.getOperand(0).getReg()) 850 .add(I.getOperand(2)) 851 .add(I.getOperand(3)); 852 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), 853 *TRI.getBoolRC(), *MRI); 854 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); 855 I.eraseFromParent(); 856 return Ret; 857 } 858 859 static bool isZero(Register Reg, MachineRegisterInfo &MRI) { 860 int64_t C; 861 if (mi_match(Reg, MRI, m_ICst(C)) && C == 0) 862 return true; 863 864 // FIXME: matcher should ignore copies 865 return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; 866 } 867 868 static unsigned extractGLC(unsigned AuxiliaryData) { 869 return AuxiliaryData & 1; 870 } 871 872 static unsigned extractSLC(unsigned AuxiliaryData) { 873 return (AuxiliaryData >> 1) & 1; 874 } 875 876 static unsigned extractDLC(unsigned AuxiliaryData) { 877 return (AuxiliaryData >> 2) & 1; 878 } 879 880 static unsigned extractSWZ(unsigned AuxiliaryData) { 881 return (AuxiliaryData >> 3) & 1; 882 } 883 884 static unsigned getBufferStoreOpcode(LLT Ty, 885 const unsigned MemSize, 886 const bool Offen) { 887 const int Size = Ty.getSizeInBits(); 888 switch (8 * MemSize) { 889 case 8: 890 return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : 891 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; 892 case 16: 893 return Offen ? 
AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : 894 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; 895 default: 896 unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : 897 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; 898 if (Size > 32) 899 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); 900 return Opc; 901 } 902 } 903 904 static unsigned getBufferStoreFormatOpcode(LLT Ty, 905 const unsigned MemSize, 906 const bool Offen) { 907 bool IsD16Packed = Ty.getScalarSizeInBits() == 16; 908 bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits(); 909 int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 910 911 if (IsD16Packed) { 912 switch (NumElts) { 913 case 1: 914 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : 915 AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; 916 case 2: 917 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact : 918 AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact; 919 case 3: 920 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact : 921 AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact; 922 case 4: 923 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact : 924 AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact; 925 default: 926 return -1; 927 } 928 } 929 930 if (IsD16Unpacked) { 931 switch (NumElts) { 932 case 1: 933 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : 934 AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; 935 case 2: 936 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact : 937 AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact; 938 case 3: 939 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact : 940 AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact; 941 case 4: 942 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact : 943 AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact; 944 default: 945 return -1; 946 } 947 } 948 949 switch (NumElts) { 950 case 1: 951 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact : 952 AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact; 953 case 2: 954 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact : 955 AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact; 956 case 3: 957 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact : 958 AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact; 959 case 4: 960 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact : 961 AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact; 962 default: 963 return -1; 964 } 965 966 llvm_unreachable("unhandled buffer store"); 967 } 968 969 // TODO: Move this to combiner 970 // Returns base register, imm offset, total constant offset. 
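// For example, a constant offset of 5000 is split into Overflow = 5000 & ~4095
// = 4096 (materialized into, or added to, the base VGPR) and ImmOffset = 904
// (placed in the immoffset field), keeping the VGPR component a multiple of
// 4096 so it is more likely to be CSEd with similar accesses.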
971 std::tuple<Register, unsigned, unsigned> 972 AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B, 973 Register OrigOffset) const { 974 const unsigned MaxImm = 4095; 975 Register BaseReg; 976 unsigned TotalConstOffset; 977 MachineInstr *OffsetDef; 978 979 std::tie(BaseReg, TotalConstOffset, OffsetDef) 980 = AMDGPU::getBaseWithConstantOffset(*MRI, OrigOffset); 981 982 unsigned ImmOffset = TotalConstOffset; 983 984 // If the immediate value is too big for the immoffset field, put the value 985 // and -4096 into the immoffset field so that the value that is copied/added 986 // for the voffset field is a multiple of 4096, and it stands more chance 987 // of being CSEd with the copy/add for another similar load/store.f 988 // However, do not do that rounding down to a multiple of 4096 if that is a 989 // negative number, as it appears to be illegal to have a negative offset 990 // in the vgpr, even if adding the immediate offset makes it positive. 991 unsigned Overflow = ImmOffset & ~MaxImm; 992 ImmOffset -= Overflow; 993 if ((int32_t)Overflow < 0) { 994 Overflow += ImmOffset; 995 ImmOffset = 0; 996 } 997 998 if (Overflow != 0) { 999 // In case this is in a waterfall loop, insert offset code at the def point 1000 // of the offset, not inside the loop. 1001 MachineBasicBlock::iterator OldInsPt = B.getInsertPt(); 1002 MachineBasicBlock &OldMBB = B.getMBB(); 1003 B.setInstr(*OffsetDef); 1004 1005 if (!BaseReg) { 1006 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1007 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1008 .addDef(BaseReg) 1009 .addImm(Overflow); 1010 } else { 1011 Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1012 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1013 .addDef(OverflowVal) 1014 .addImm(Overflow); 1015 1016 Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1017 TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg) 1018 .addReg(BaseReg) 1019 .addReg(OverflowVal, RegState::Kill) 1020 .addImm(0); 1021 BaseReg = NewBaseReg; 1022 } 1023 1024 B.setInsertPt(OldMBB, OldInsPt); 1025 } 1026 1027 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 1028 } 1029 1030 bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, 1031 bool IsFormat) const { 1032 MachineIRBuilder B(MI); 1033 MachineFunction &MF = B.getMF(); 1034 Register VData = MI.getOperand(1).getReg(); 1035 LLT Ty = MRI->getType(VData); 1036 1037 int Size = Ty.getSizeInBits(); 1038 if (Size % 32 != 0) 1039 return false; 1040 1041 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 1042 MachineMemOperand *MMO = *MI.memoperands_begin(); 1043 const int MemSize = MMO->getSize(); 1044 1045 Register RSrc = MI.getOperand(2).getReg(); 1046 Register VOffset = MI.getOperand(3).getReg(); 1047 Register SOffset = MI.getOperand(4).getReg(); 1048 unsigned AuxiliaryData = MI.getOperand(5).getImm(); 1049 unsigned ImmOffset; 1050 unsigned TotalOffset; 1051 1052 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 1053 if (TotalOffset != 0) 1054 MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize); 1055 1056 const bool Offen = !isZero(VOffset, *MRI); 1057 1058 int Opc = IsFormat ? 
getBufferStoreFormatOpcode(Ty, MemSize, Offen) : 1059 getBufferStoreOpcode(Ty, MemSize, Offen); 1060 if (Opc == -1) 1061 return false; 1062 1063 MachineInstrBuilder MIB = B.buildInstr(Opc) 1064 .addUse(VData); 1065 1066 if (Offen) 1067 MIB.addUse(VOffset); 1068 1069 MIB.addUse(RSrc) 1070 .addUse(SOffset) 1071 .addImm(ImmOffset) 1072 .addImm(extractGLC(AuxiliaryData)) 1073 .addImm(extractSLC(AuxiliaryData)) 1074 .addImm(0) // tfe: FIXME: Remove from inst 1075 .addImm(extractDLC(AuxiliaryData)) 1076 .addImm(extractSWZ(AuxiliaryData)) 1077 .addMemOperand(MMO); 1078 1079 MI.eraseFromParent(); 1080 1081 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1082 } 1083 1084 static unsigned getDSShaderTypeValue(const MachineFunction &MF) { 1085 switch (MF.getFunction().getCallingConv()) { 1086 case CallingConv::AMDGPU_PS: 1087 return 1; 1088 case CallingConv::AMDGPU_VS: 1089 return 2; 1090 case CallingConv::AMDGPU_GS: 1091 return 3; 1092 case CallingConv::AMDGPU_HS: 1093 case CallingConv::AMDGPU_LS: 1094 case CallingConv::AMDGPU_ES: 1095 report_fatal_error("ds_ordered_count unsupported for this calling conv"); 1096 case CallingConv::AMDGPU_CS: 1097 case CallingConv::AMDGPU_KERNEL: 1098 case CallingConv::C: 1099 case CallingConv::Fast: 1100 default: 1101 // Assume other calling conventions are various compute callable functions 1102 return 0; 1103 } 1104 } 1105 1106 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 1107 MachineInstr &MI, Intrinsic::ID IntrID) const { 1108 MachineBasicBlock *MBB = MI.getParent(); 1109 MachineFunction *MF = MBB->getParent(); 1110 const DebugLoc &DL = MI.getDebugLoc(); 1111 1112 unsigned IndexOperand = MI.getOperand(7).getImm(); 1113 bool WaveRelease = MI.getOperand(8).getImm() != 0; 1114 bool WaveDone = MI.getOperand(9).getImm() != 0; 1115 1116 if (WaveDone && !WaveRelease) 1117 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 1118 1119 unsigned OrderedCountIndex = IndexOperand & 0x3f; 1120 IndexOperand &= ~0x3f; 1121 unsigned CountDw = 0; 1122 1123 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 1124 CountDw = (IndexOperand >> 24) & 0xf; 1125 IndexOperand &= ~(0xf << 24); 1126 1127 if (CountDw < 1 || CountDw > 4) { 1128 report_fatal_error( 1129 "ds_ordered_count: dword count must be between 1 and 4"); 1130 } 1131 } 1132 1133 if (IndexOperand) 1134 report_fatal_error("ds_ordered_count: bad index operand"); 1135 1136 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; 1137 unsigned ShaderType = getDSShaderTypeValue(*MF); 1138 1139 unsigned Offset0 = OrderedCountIndex << 2; 1140 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | 1141 (Instruction << 4); 1142 1143 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) 1144 Offset1 |= (CountDw - 1) << 6; 1145 1146 unsigned Offset = Offset0 | (Offset1 << 8); 1147 1148 Register M0Val = MI.getOperand(2).getReg(); 1149 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1150 .addReg(M0Val); 1151 1152 Register DstReg = MI.getOperand(0).getReg(); 1153 Register ValReg = MI.getOperand(3).getReg(); 1154 MachineInstrBuilder DS = 1155 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) 1156 .addReg(ValReg) 1157 .addImm(Offset) 1158 .cloneMemRefs(MI); 1159 1160 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) 1161 return false; 1162 1163 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); 1164 MI.eraseFromParent(); 1165 return Ret; 1166 } 1167 1168 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 1169 switch (IntrID) { 1170 case Intrinsic::amdgcn_ds_gws_init: 1171 return AMDGPU::DS_GWS_INIT; 1172 case Intrinsic::amdgcn_ds_gws_barrier: 1173 return AMDGPU::DS_GWS_BARRIER; 1174 case Intrinsic::amdgcn_ds_gws_sema_v: 1175 return AMDGPU::DS_GWS_SEMA_V; 1176 case Intrinsic::amdgcn_ds_gws_sema_br: 1177 return AMDGPU::DS_GWS_SEMA_BR; 1178 case Intrinsic::amdgcn_ds_gws_sema_p: 1179 return AMDGPU::DS_GWS_SEMA_P; 1180 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1181 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 1182 default: 1183 llvm_unreachable("not a gws intrinsic"); 1184 } 1185 } 1186 1187 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, 1188 Intrinsic::ID IID) const { 1189 if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && 1190 !STI.hasGWSSemaReleaseAll()) 1191 return false; 1192 1193 // intrinsic ID, vsrc, offset 1194 const bool HasVSrc = MI.getNumOperands() == 3; 1195 assert(HasVSrc || MI.getNumOperands() == 2); 1196 1197 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg(); 1198 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); 1199 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) 1200 return false; 1201 1202 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1203 assert(OffsetDef); 1204 1205 unsigned ImmOffset; 1206 1207 MachineBasicBlock *MBB = MI.getParent(); 1208 const DebugLoc &DL = MI.getDebugLoc(); 1209 1210 MachineInstr *Readfirstlane = nullptr; 1211 1212 // If we legalized the VGPR input, strip out the readfirstlane to analyze the 1213 // incoming offset, in case there's an add of a constant. We'll have to put it 1214 // back later. 1215 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { 1216 Readfirstlane = OffsetDef; 1217 BaseOffset = OffsetDef->getOperand(1).getReg(); 1218 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1219 } 1220 1221 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { 1222 // If we have a constant offset, try to use the 0 in m0 as the base. 1223 // TODO: Look into changing the default m0 initialization value. If the 1224 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1225 // the immediate offset. 
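    // With a known-constant offset, m0 is simply set to 0 and the whole
    // constant goes into the instruction's offset field; the variable-offset
    // path below instead shifts the SGPR base into m0[21:16].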
    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset, OffsetDef)
      = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16);

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);
    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .addImm(-1) // $gds
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
    .addImm(Offset)
    .addImm(IsGDS ? -1 : 0)
    .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf: {
    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
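    // SI_END_CF consumes the saved control-flow mask, so its operand is placed
    // in the wave mask register class rather than treated as an s1 boolean.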
    BuildMI(*BB, &I, I.getDebugLoc(),
            TII.get(AMDGPU::SI_END_CF))
      .add(I.getOperand(1));

    Register Reg = I.getOperand(1).getReg();
    I.eraseFromParent();

    if (!MRI->getRegClassOrNull(Reg))
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return selectStoreIntrinsic(I, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return selectStoreIntrinsic(I, true);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the SCC
    // register bank, because it does not cover the register class we use to
    // represent it, so manually set the register class here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
               constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
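  // The VCC path selects a 32-bit G_SELECT to V_CNDMASK_B32_e64 with zeroed
  // source modifiers: per lane, dst = cond ? operand(2) : operand(3).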
1380 if (Size > 32) 1381 return false; 1382 1383 MachineInstr *Select = 1384 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1385 .addImm(0) 1386 .add(I.getOperand(3)) 1387 .addImm(0) 1388 .add(I.getOperand(2)) 1389 .add(I.getOperand(1)); 1390 1391 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1392 I.eraseFromParent(); 1393 return Ret; 1394 } 1395 1396 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1397 initM0(I); 1398 return selectImpl(I, *CoverageInfo); 1399 } 1400 1401 static int sizeToSubRegIndex(unsigned Size) { 1402 switch (Size) { 1403 case 32: 1404 return AMDGPU::sub0; 1405 case 64: 1406 return AMDGPU::sub0_sub1; 1407 case 96: 1408 return AMDGPU::sub0_sub1_sub2; 1409 case 128: 1410 return AMDGPU::sub0_sub1_sub2_sub3; 1411 case 256: 1412 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1413 default: 1414 if (Size < 32) 1415 return AMDGPU::sub0; 1416 if (Size > 256) 1417 return -1; 1418 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1419 } 1420 } 1421 1422 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1423 Register DstReg = I.getOperand(0).getReg(); 1424 Register SrcReg = I.getOperand(1).getReg(); 1425 const LLT DstTy = MRI->getType(DstReg); 1426 const LLT SrcTy = MRI->getType(SrcReg); 1427 if (!DstTy.isScalar()) 1428 return false; 1429 1430 const LLT S1 = LLT::scalar(1); 1431 1432 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1433 const RegisterBank *DstRB; 1434 if (DstTy == S1) { 1435 // This is a special case. We don't treat s1 for legalization artifacts as 1436 // vcc booleans. 1437 DstRB = SrcRB; 1438 } else { 1439 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1440 if (SrcRB != DstRB) 1441 return false; 1442 } 1443 1444 unsigned DstSize = DstTy.getSizeInBits(); 1445 unsigned SrcSize = SrcTy.getSizeInBits(); 1446 1447 const TargetRegisterClass *SrcRC 1448 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1449 const TargetRegisterClass *DstRC 1450 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1451 1452 if (SrcSize > 32) { 1453 int SubRegIdx = sizeToSubRegIndex(DstSize); 1454 if (SubRegIdx == -1) 1455 return false; 1456 1457 // Deal with weird cases where the class only partially supports the subreg 1458 // index. 1459 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1460 if (!SrcRC) 1461 return false; 1462 1463 I.getOperand(1).setSubReg(SubRegIdx); 1464 } 1465 1466 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1467 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1468 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1469 return false; 1470 } 1471 1472 I.setDesc(TII.get(TargetOpcode::COPY)); 1473 return true; 1474 } 1475 1476 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1477 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1478 Mask = maskTrailingOnes<unsigned>(Size); 1479 int SignedMask = static_cast<int>(Mask); 1480 return SignedMask >= -16 && SignedMask <= 64; 1481 } 1482 1483 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1484 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1485 Register Reg, const MachineRegisterInfo &MRI, 1486 const TargetRegisterInfo &TRI) const { 1487 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1488 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1489 return RB; 1490 1491 // Ignore the type, since we don't use vcc in artifacts. 
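  // Passing an invalid LLT sidesteps the s1 -> VCC special case, so a 1-bit
  // artifact stays on the SGPR/VGPR bank implied by its register class.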
1492 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1493 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1494 return nullptr; 1495 } 1496 1497 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1498 bool Signed = I.getOpcode() == AMDGPU::G_SEXT; 1499 const DebugLoc &DL = I.getDebugLoc(); 1500 MachineBasicBlock &MBB = *I.getParent(); 1501 const Register DstReg = I.getOperand(0).getReg(); 1502 const Register SrcReg = I.getOperand(1).getReg(); 1503 1504 const LLT DstTy = MRI->getType(DstReg); 1505 const LLT SrcTy = MRI->getType(SrcReg); 1506 const unsigned SrcSize = SrcTy.getSizeInBits(); 1507 const unsigned DstSize = DstTy.getSizeInBits(); 1508 if (!DstTy.isScalar()) 1509 return false; 1510 1511 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1512 return selectCOPY(I); 1513 1514 // Artifact casts should never use vcc. 1515 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1516 1517 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1518 // 64-bit should have been split up in RegBankSelect 1519 1520 // Try to use an and with a mask if it will save code size. 1521 unsigned Mask; 1522 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1523 MachineInstr *ExtI = 1524 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1525 .addImm(Mask) 1526 .addReg(SrcReg); 1527 I.eraseFromParent(); 1528 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1529 } 1530 1531 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1532 MachineInstr *ExtI = 1533 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1534 .addReg(SrcReg) 1535 .addImm(0) // Offset 1536 .addImm(SrcSize); // Width 1537 I.eraseFromParent(); 1538 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1539 } 1540 1541 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1542 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) 1543 return false; 1544 1545 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1546 const unsigned SextOpc = SrcSize == 8 ? 1547 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1548 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1549 .addReg(SrcReg); 1550 I.eraseFromParent(); 1551 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1552 } 1553 1554 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1555 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1556 1557 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1558 if (DstSize > 32 && SrcSize <= 32) { 1559 // We need a 64-bit register source, but the high bits don't matter. 
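      // Widen with an undef high half, then use the 64-bit BFE. With the
      // S1[5:0] = offset, S1[22:16] = width encoding, e.g. a sext from s16
      // uses an immediate of 16 << 16 = 0x100000 (offset 0, width 16).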
1560 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1561 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1562 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1563 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1564 .addReg(SrcReg) 1565 .addImm(AMDGPU::sub0) 1566 .addReg(UndefReg) 1567 .addImm(AMDGPU::sub1); 1568 1569 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1570 .addReg(ExtReg) 1571 .addImm(SrcSize << 16); 1572 1573 I.eraseFromParent(); 1574 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1575 } 1576 1577 unsigned Mask; 1578 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1579 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1580 .addReg(SrcReg) 1581 .addImm(Mask); 1582 } else { 1583 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1584 .addReg(SrcReg) 1585 .addImm(SrcSize << 16); 1586 } 1587 1588 I.eraseFromParent(); 1589 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1590 } 1591 1592 return false; 1593 } 1594 1595 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1596 MachineBasicBlock *BB = I.getParent(); 1597 MachineOperand &ImmOp = I.getOperand(1); 1598 1599 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1600 if (ImmOp.isFPImm()) { 1601 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1602 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1603 } else if (ImmOp.isCImm()) { 1604 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1605 } 1606 1607 Register DstReg = I.getOperand(0).getReg(); 1608 unsigned Size; 1609 bool IsSgpr; 1610 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1611 if (RB) { 1612 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1613 Size = MRI->getType(DstReg).getSizeInBits(); 1614 } else { 1615 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1616 IsSgpr = TRI.isSGPRClass(RC); 1617 Size = TRI.getRegSizeInBits(*RC); 1618 } 1619 1620 if (Size != 32 && Size != 64) 1621 return false; 1622 1623 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1624 if (Size == 32) { 1625 I.setDesc(TII.get(Opcode)); 1626 I.addImplicitDefUseOperands(*MF); 1627 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1628 } 1629 1630 const DebugLoc &DL = I.getDebugLoc(); 1631 1632 APInt Imm(Size, I.getOperand(1).getImm()); 1633 1634 MachineInstr *ResInst; 1635 if (IsSgpr && TII.isInlineConstant(Imm)) { 1636 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1637 .addImm(I.getOperand(1).getImm()); 1638 } else { 1639 const TargetRegisterClass *RC = IsSgpr ? 
      &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
    Register LoReg = MRI->createVirtualRegister(RC);
    Register HiReg = MRI->createVirtualRegister(RC);

    BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
      .addImm(Imm.trunc(32).getZExtValue());

    BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
      .addImm(Imm.ashr(32).getZExtValue());

    ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(LoReg)
      .addImm(AMDGPU::sub0)
      .addReg(HiReg)
      .addImm(AMDGPU::sub1);
  }

  // We can't call constrainSelectedInstRegOperands here, because it doesn't
  // work for target-independent opcodes.
  I.eraseFromParent();
  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
  if (!DstRC)
    return true;
  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}

bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
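  // Arguments and globals are treated the same way below: for a kernel they
  // are identical across every lane of a wave, so the access is uniform.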
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
       STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
  }
}

bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}

bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for
  // now that RegBankSelect knows what it's doing if the branch condition is
  // scc, even though it currently does not.
  if (!isVCC(CondReg, *MRI)) {
    if (MRI->getType(CondReg) != LLT::scalar(32))
      return false;

    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    // FIXME: Hack for isSCC tests
    ConstrainRC = &AMDGPU::SGPR_32RegClass;
  } else {
    // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // We sort of know, based on the register bank, that a VCC producer ands
    // inactive lanes with 0. What if there was a logical operation with vcc
    // producers in different blocks/with different exec masks?
    // FIXME: Should scc->vcc copies and with exec?
    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  }

  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
  if (IsVGPR)
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  return RBI.constrainGenericRegister(
    DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
}

bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const {
  uint64_t Align = I.getOperand(2).getImm();
  const uint64_t Mask = ~((UINT64_C(1) << Align) - 1);

  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
  unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
  const TargetRegisterClass &RegRC
    = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  LLT Ty = MRI->getType(DstReg);

  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
                                                                  *MRI);
  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
                                                                  *MRI);
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  Register ImmReg = MRI->createVirtualRegister(&RegRC);
  BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg)
    .addImm(Mask);

  if (Ty.getSizeInBits() == 32) {
    BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
      .addReg(SrcReg)
      .addReg(ImmReg);
    I.eraseFromParent();
    return true;
  }

  Register HiReg = MRI->createVirtualRegister(&RegRC);
  Register LoReg = MRI->createVirtualRegister(&RegRC);
  Register MaskLo = MRI->createVirtualRegister(&RegRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(SrcReg, 0, AMDGPU::sub0);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(SrcReg, 0, AMDGPU::sub1);

  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo)
    .addReg(LoReg)
    .addReg(ImmReg);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(MaskLo)
    .addImm(AMDGPU::sub0)
    .addReg(HiReg)
    .addImm(AMDGPU::sub1);
  I.eraseFromParent();
  return true;
}

/// Return the register to use for the index value, and the subregister to use
/// for the indirectly accessed register.
static std::pair<Register, unsigned>
computeIndirectRegIndex(MachineRegisterInfo &MRI,
                        const SIRegisterInfo &TRI,
                        const TargetRegisterClass *SuperRC,
                        Register IdxReg,
                        unsigned EltSize) {
  Register IdxBaseReg;
  int Offset;
  MachineInstr *Unused;

  std::tie(IdxBaseReg, Offset, Unused)
    = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
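  // Note that a negative Offset also trips this check: the cast to unsigned
  // wraps it to a large value, so we conservatively fall back to the original
  // index register and the first subregister.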
  if (static_cast<unsigned>(Offset) >= SubRegs.size())
    return std::make_pair(IdxReg, SubRegs[0]);
  return std::make_pair(IdxBaseReg, SubRegs[Offset]);
}

bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
  MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register IdxReg = MI.getOperand(2).getReg();

  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
                                                                  *MRI);
  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
                                                                  *MRI);
  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const bool Is64 = DstTy.getSizeInBits() == 64;

  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
                                                     DstTy.getSizeInBits() / 8);

  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
    if (DstTy.getSizeInBits() != 32 && !Is64)
      return false;

    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
    BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
      .addReg(SrcReg, 0, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
    return false;

  if (!STI.useVGPRIndexMode()) {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
    .addReg(IdxReg)
    .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
  BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
    .addReg(SrcReg, RegState::Undef, SubReg)
    .addReg(SrcReg, RegState::Implicit)
    .addReg(AMDGPU::M0, RegState::Implicit);
  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));

  MI.eraseFromParent();
  return true;
}

// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
  MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register VecReg = MI.getOperand(1).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  LLT VecTy = MRI->getType(DstReg);
  LLT ValTy = MRI->getType(ValReg);
  unsigned VecSize = VecTy.getSizeInBits();
  unsigned ValSize = ValTy.getSizeInBits();

  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  assert(VecTy.getElementType() == ValTy);

  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
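  // Selection plan for what follows: either enable VGPR indexing mode
  // (S_SET_GPR_IDX_ON/OFF) or copy the index into M0, then write the element
  // through the indirect register-write pseudo.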
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
                                                                  *MRI);
  const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
                                                                  *MRI);

  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
    return false;

  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
                                                     ValSize / 8);

  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
                         STI.useVGPRIndexMode();

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (IndexMode) {
    BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
      .addReg(IdxReg)
      .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
  } else {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
  }

  const MCInstrDesc &RegWriteOp
    = TII.getIndirectRegWritePseudo(VecSize, ValSize,
                                    VecRB->getID() == AMDGPU::SGPRRegBankID);
  BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
    .addReg(VecReg)
    .addReg(ValReg)
    .addImm(SubReg);

  if (IndexMode)
    BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::select(MachineInstr &I) {
  if (I.isPHI())
    return selectPHI(I);

  if (!I.isPreISelOpcode()) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    if (selectG_AND_OR_XOR(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_ADD_SUB(I);
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return selectG_UADDO_USUBO_UADDE_USUBE(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_PTRTOINT:
    return selectCOPY(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
    return selectG_CONSTANT(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_PTR_ADD:
    return selectG_PTR_ADD(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
    if (selectG_ICMP(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_FADD:
    return selectG_LOAD_ATOMICRMW(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_STORE:
    return selectG_STORE(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_SZA_EXT(I);
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_FRAME_INDEX:
    return selectG_FRAME_INDEX(I);
  case TargetOpcode::G_PTR_MASK:
    return selectG_PTR_MASK(I);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return selectG_EXTRACT_VECTOR_ELT(I);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return selectG_INSERT_VECTOR_ELT(I);
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
    initM0(I);
    return selectImpl(I, *CoverageInfo);
  default:
    return selectImpl(I, *CoverageInfo);
  }
  return false;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(
  Register Src) const {
  unsigned Mods = 0;
  MachineInstr *MI = MRI->getVRegDef(Src);

  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::NEG;
    MI = MRI->getVRegDef(Src);
  }

  if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::ABS;
  }

  return std::make_pair(Src, Mods);
}

///
/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
  if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
    return None;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
  // FIXME: Handle clamp and op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // clamp
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
  // FIXME: Handle op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];

  if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
    return None;

  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
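  // Decompose the address into an SGPR base plus constant offset so we can
  // check below whether the offset fits the 32-bit literal SMRD form.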
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  if (!isUInt<32>(EncodedImm))
    return None;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM Patterns are considered before the _SGPR patterns.
  unsigned PtrReg = GEPInfo.SgprParts[0];
  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(GEPInfo.Imm);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}

template <bool Signed>
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();

  InstructionSelector::ComplexRendererFns Default = {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
  }};

  if (!STI.hasFlatInstOffsets())
    return Default;

  const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
  if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
    return Default;

  Optional<int64_t> Offset =
    getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
  if (!Offset.hasValue())
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
    return Default;

  Register BasePtr = OpDef->getOperand(1).getReg();

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  return selectFlatOffsetImpl<false>(Root);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
  return selectFlatOffsetImpl<true>(Root);
}

static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
      .addImm(Offset & ~4095);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               const MachineMemOperand *MMO = *MI->memoperands_begin();
               const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

               Register SOffsetReg = isStackPtrRelative(PtrInfo)
                                         ? Info->getStackPtrOffsetReg()
                                         : Info->getScratchWaveOffsetReg();
               MIB.addReg(SOffsetReg);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    if (isBaseWithConstantOffset(Root, *MRI)) {
      const MachineOperand &LHS = RootDef->getOperand(1);
      const MachineOperand &RHS = RootDef->getOperand(2);
      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
      if (LHSDef && RHSDef) {
        int64_t PossibleOffset =
          RHSDef->getOperand(1).getCImm()->getSExtValue();
        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
            (!STI.privateMemoryResourceIsRangeChecked() ||
             KnownBits->signBitIsZero(LHS.getReg()))) {
          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
            FI = LHSDef->getOperand(1).getIndex();
          else
            VAddr = LHS.getReg();
          Offset = PossibleOffset;
        }
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  // TODO: Should split large offsets that don't fit like above.
  // TODO: Don't use scratch wave offset just because the offset didn't fit.
  Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
                                   : Info->getScratchWaveOffsetReg();

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             MIB.addReg(SOffset);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}

bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
                                                int64_t Offset,
                                                unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return KnownBits->signBitIsZero(Base);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  Register SOffsetReg = isStackPtrRelative(PtrInfo)
                            ? Info->getStackPtrOffsetReg()
                            : Info->getScratchWaveOffsetReg();
  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(Info->getScratchRSrcReg());
      },                                                         // rsrc
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }      // offset
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::make_pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;
  if (isBaseWithConstantOffset(Root, *MRI)) {
    const MachineOperand &LHS = RootDef->getOperand(1);
    const MachineOperand &RHS = RootDef->getOperand(2);
    const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
    const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
    if (LHSDef && RHSDef) {
      int64_t PossibleOffset =
        RHSDef->getOperand(1).getCImm()->getSExtValue();
      if (isDSOffsetLegal(LHS.getReg(), PossibleOffset, 16)) {
        // (add n0, c0)
        return std::make_pair(LHS.getReg(), PossibleOffset);
      }
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO
  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO
  }

  return std::make_pair(Root.getReg(), 0);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
  }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), *MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(CstVal.getValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx == -1);

  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
    MIB.addImm(Op.getCImm()->getSExtValue());
  }
}

void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
}

/// This only really exists to satisfy DAG type checking machinery, so is a
/// no-op here.
void AMDGPUInstructionSelector::renderTruncTImm32(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm());
}

void AMDGPUInstructionSelector::renderTruncTImm16(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm());
}

void AMDGPUInstructionSelector::renderTruncTImm1(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}