//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
  if (Register::isPhysicalRegister(Reg))
    return Reg == AMDGPU::SCC;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the
    // context of the register bank has been lost.
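    // With only a register class available, the best we can do is match the
    // class used for SCC producers (SReg_32_XM0) together with an s1 LLT, as
    // the checks below do.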
    if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID)
      return false;
    const LLT Ty = MRI.getType(Reg);
    return Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::SCCRegBankID;
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  if (Register::isPhysicalRegister(Reg))
    return Reg == TRI.getVCC();

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, MRI);
    }

    if (!isVCC(SrcReg, MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), MRI))
        return false;

      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(SrcReg);

      if (!MRI.getRegClassOrNull(SrcReg))
        MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI));
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to know
    // specifically that the result is VCCRegBank, and not just an SGPR with
    // size 1. An SReg_32 with size 1 is ambiguous with wave32.
    if (Src.isUndef()) {
      const TargetRegisterClass *SrcRC =
        TRI.getConstrainedRegClassForOperand(Src, MRI);
      if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
        return false;
    }

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (Register::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  return true;
}
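
// Note (illustrative, hypothetical MIR): copying an s1 value from a non-VCC
// source into a VCC-bank destination is selected above as a compare with
// zero, e.g.
//   %dst:sreg_64_xexec = V_CMP_NE_U32_e64 0, %src
// so consumers of the wave mask see a proper VCC-bank producer.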
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI.getType(DefReg);

  // TODO: Verify this doesn't have insane operands (e.g. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI.getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    if (RB.getID() == AMDGPU::SCCRegBankID) {
      LLVM_DEBUG(dbgs() << "illegal scc phi\n");
      return false;
    }

    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = MRI.createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static int64_t getConstant(const MachineInstr *MI) {
  return MI->getOperand(1).getCImm()->getSExtValue();
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}
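
// For a VCC-bank (bool) result, G_AND/G_OR/G_XOR operate on lane masks, so
// they are selected to the scalar S_*_B64 forms below (S_*_B32 in wave32)
// even though the value is logically per-lane.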
bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineOperand &Dst = I.getOperand(0);
  MachineOperand &Src0 = I.getOperand(1);
  MachineOperand &Src1 = I.getOperand(2);
  Register DstReg = Dst.getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
    const TargetRegisterClass *RC = TRI.getBoolRC();
    unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
                                           RC == &AMDGPU::SReg_64RegClass);
    I.setDesc(TII.get(InstOpc));

    // FIXME: Hack to avoid turning the register bank into a register class.
    // The selector for G_ICMP relies on seeing that the register bank for the
    // result is VCC. In wave32, if we constrain the registers to SReg_32 here,
    // it will be ambiguous whether it's a scalar or vector bool.
    if (Src0.isUndef() && !MRI.getRegClassOrNull(Src0.getReg()))
      MRI.setRegClass(Src0.getReg(), RC);
    if (Src1.isUndef() && !MRI.getRegClassOrNull(Src1.getReg()))
      MRI.setRegClass(Src1.getReg(), RC);

    return RBI.constrainGenericRegister(DstReg, *RC, MRI);
  }

  // TODO: Should this allow an SCC bank result, and produce a copy from SCC
  // for the result?
  if (DstRB->getID() == AMDGPU::SGPRRegBankID) {
    unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32);
    I.setDesc(TII.get(InstOpc));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  return false;
}
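
// A 64-bit G_ADD is expanded below into a low add that defines a carry and a
// high add-with-carry that consumes it, recombined with a REG_SEQUENCE.
// Sketch of the SALU case (illustrative MIR, names invented):
//   %lo:sreg_32 = S_ADD_U32 %a.sub0, %b.sub0    ; implicit-def $scc
//   %hi:sreg_32 = S_ADDC_U32 %a.sub1, %b.sub1   ; implicit use of $scc
//   %dst:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1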
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
          .add(I.getOperand(1))
          .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;

    Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .addDef(UnusedCarry, RegState::Dead)
        .add(I.getOperand(1))
        .add(I.getOperand(2))
        .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI.createVirtualRegister(&HalfRC);
  Register DstHi = MRI.createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  assert(I.getOperand(2).getImm() % 32 == 0);
  unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32);
  const DebugLoc &DL = I.getDebugLoc();
  MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY),
                               I.getOperand(0).getReg())
                         .addReg(I.getOperand(1).getReg(), 0, SubReg);

  for (const MachineOperand &MO : Copy->operands()) {
    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI))
    return false;

  MI.eraseFromParent();
  return true;
}
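
// The inverse of the REG_SEQUENCE merge above: a G_UNMERGE_VALUES becomes one
// subregister COPY per destination, e.g. (illustrative)
//   %d0:vgpr_32 = COPY %src.sub0
//   %d1:vgpr_32 = COPY %src.sub1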
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg0);
  LLT SrcTy = MRI.getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
    return false;

  const unsigned SrcFlags = getUndefRegState(Src.isUndef());

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, SrcFlags, SubRegs[I]);

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI);
  if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI.getType(Src1Reg);
  if (Src1Ty.getSizeInBits() != 32)
    return false;

  int64_t Offset = I.getOperand(3).getImm();
  if (Offset % 32 != 0)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32);
  const DebugLoc &DL = I.getDebugLoc();

  MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG))
                        .addDef(I.getOperand(0).getReg())
                        .addReg(Src0Reg)
                        .addReg(Src1Reg)
                        .addImm(SubReg);

  for (const MachineOperand &MO : Ins->operands()) {
    if (!MO.isReg())
      continue;
    if (Register::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();
    MachineFunction *MF = BB->getParent();
    MachineRegisterInfo &MRI = MF->getRegInfo();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg }) {
      if (!MRI.getRegClassOrNull(Reg))
        MRI.setRegClass(Reg, TRI.getWaveMaskRegClass());
    }

    return true;
  }
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}
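
// Uniform (SGPR-bank) comparisons use S_CMP_*, which writes SCC, rather than
// the V_CMP_* forms above, which write a lane mask. Only 32-bit compares
// (plus 64-bit equality on subtargets that support it) exist on the scalar
// unit, hence the separate, smaller table below.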
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (isSCC(CCReg, MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
      .add(I.getOperand(2))
      .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
      constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
      RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
                               I.getOperand(0).getReg())
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}
static MachineInstr *
buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
         unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3,
         unsigned VM, bool Compr, unsigned Enabled, bool Done) {
  const DebugLoc &DL = Insert->getDebugLoc();
  MachineBasicBlock &BB = *Insert->getParent();
  unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP;
  return BuildMI(BB, Insert, DL, TII.get(Opcode))
    .addImm(Tgt)
    .addReg(Reg0)
    .addReg(Reg1)
    .addReg(Reg2)
    .addReg(Reg3)
    .addImm(VM)
    .addImm(Compr)
    .addImm(Enabled);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp: {
    int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
    int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
    int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg()));
    int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg()));

    MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
                                 I.getOperand(4).getReg(),
                                 I.getOperand(5).getReg(),
                                 I.getOperand(6).getReg(),
                                 VM, false, Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  case Intrinsic::amdgcn_exp_compr: {
    const DebugLoc &DL = I.getDebugLoc();
    int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
    int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
    Register Reg0 = I.getOperand(3).getReg();
    Register Reg1 = I.getOperand(4).getReg();
    Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg()));
    int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg()));

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
    MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
                                 true, Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  case Intrinsic::amdgcn_end_cf: {
    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(),
            TII.get(AMDGPU::SI_END_CF))
      .add(I.getOperand(1));

    Register Reg = I.getOperand(1).getReg();
    I.eraseFromParent();

    if (!MRI.getRegClassOrNull(Reg))
      MRI.setRegClass(Reg, TRI.getWaveMaskRegClass());
    return true;
  }
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (isSCC(CCReg, MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it, so we need to manually set the register class here.
    if (!MRI.getRegClassOrNull(CCReg))
      MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
               constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
  if (Size > 32)
    return false;

  MachineInstr *Select =
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .add(I.getOperand(3))
      .addImm(0)
      .add(I.getOperand(2))
      .add(I.getOperand(1));

  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}
static int sizeToSubRegIndex(unsigned Size) {
  switch (Size) {
  case 32:
    return AMDGPU::sub0;
  case 64:
    return AMDGPU::sub0_sub1;
  case 96:
    return AMDGPU::sub0_sub1_sub2;
  case 128:
    return AMDGPU::sub0_sub1_sub2_sub3;
  case 256:
    return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
  default:
    if (Size < 32)
      return AMDGPU::sub0;
    if (Size > 256)
      return -1;
    return sizeToSubRegIndex(PowerOf2Ceil(Size));
  }
}

bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI.getType(DstReg);
  const LLT SrcTy = MRI.getType(SrcReg);
  if (!DstTy.isScalar())
    return false;

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI);
  if (SrcRB != DstRB)
    return false;

  unsigned DstSize = DstTy.getSizeInBits();
  unsigned SrcSize = SrcTy.getSizeInBits();

  const TargetRegisterClass *SrcRC
    = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI);
  const TargetRegisterClass *DstRC
    = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI);

  if (SrcSize > 32) {
    int SubRegIdx = sizeToSubRegIndex(DstSize);
    if (SubRegIdx == -1)
      return false;

    // Deal with weird cases where the class only partially supports the subreg
    // index.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
    if (!SrcRC)
      return false;

    I.getOperand(1).setSubReg(SubRegIdx);
  }

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
    return false;
  }

  I.setDesc(TII.get(TargetOpcode::COPY));
  return true;
}
/// \returns true if a bitmask for \p Size bits will be an inline immediate.
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  Mask = maskTrailingOnes<unsigned>(Size);
  int SignedMask = static_cast<int>(Mask);
  return SignedMask >= -16 && SignedMask <= 64;
}

bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
  bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();

  const LLT DstTy = MRI.getType(DstReg);
  const LLT SrcTy = MRI.getType(SrcReg);
  const LLT S1 = LLT::scalar(1);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  if (!DstTy.isScalar())
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);

  if (SrcBank->getID() == AMDGPU::SCCRegBankID) {
    if (SrcTy != S1 || DstSize > 64) // Invalid
      return false;

    unsigned Opcode =
        DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    const TargetRegisterClass *DstRC =
        DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass;

    // FIXME: Create an extra copy to avoid incorrectly constraining the result
    // of the scc producer.
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg)
      .addReg(SrcReg);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(TmpReg);

    // The instruction operands are backwards from what you would expect.
    BuildMI(MBB, I, DL, TII.get(Opcode), DstReg)
      .addImm(0)
      .addImm(Signed ? -1 : 1);
    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
  }

  if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) {
    if (SrcTy != S1) // Invalid
      return false;

    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)               // src0_modifiers
        .addImm(0)               // src0
        .addImm(0)               // src1_modifiers
        .addImm(Signed ? -1 : 1) // src1
        .addUse(SrcReg);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  }

  if (I.getOpcode() == AMDGPU::G_ANYEXT)
    return selectCOPY(I);

  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
    // 64-bit should have been split up in RegBankSelect

    // Try to use an and with a mask if it will save code size.
    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      MachineInstr *ExtI =
        BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
          .addImm(Mask)
          .addReg(SrcReg);
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
    }

    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
        .addReg(SrcReg)
        .addImm(0)        // Offset
        .addImm(SrcSize); // Width
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  }

  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
    if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI))
      return false;

    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
      const unsigned SextOpc = SrcSize == 8 ?
        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
        .addReg(SrcReg);
      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
    }

    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
    if (DstSize > 32 && SrcSize <= 32) {
      // We need a 64-bit register source, but the high bits don't matter.
      Register ExtReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
      Register UndefReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
        .addReg(SrcReg)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);

      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
        .addReg(ExtReg)
        .addImm(SrcSize << 16);

      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
    }

    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
        .addReg(SrcReg)
        .addImm(Mask);
    } else {
      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
        .addReg(SrcReg)
        .addImm(SrcSize << 16);
    }

    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
  }

  return false;
}
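
// Illustrative 64-bit materialization produced below (hypothetical vregs):
//   %lo:sreg_32_xm0 = S_MOV_B32 <imm low 32 bits>
//   %hi:sreg_32_xm0 = S_MOV_B32 <imm high 32 bits>
//   %dst:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1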
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineOperand &ImmOp = I.getOperand(1);

  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
  if (ImmOp.isFPImm()) {
    const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
    ImmOp.ChangeToImmediate(Imm.getZExtValue());
  } else if (ImmOp.isCImm()) {
    ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue());
  }

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size;
  bool IsSgpr;
  const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg());
  if (RB) {
    IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
    Size = MRI.getType(DstReg).getSizeInBits();
  } else {
    const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg);
    IsSgpr = TRI.isSGPRClass(RC);
    Size = TRI.getRegSizeInBits(*RC);
  }

  if (Size != 32 && Size != 64)
    return false;

  unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  if (Size == 32) {
    I.setDesc(TII.get(Opcode));
    I.addImplicitDefUseOperands(*MF);
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  DebugLoc DL = I.getDebugLoc();
  const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass :
                                           &AMDGPU::VGPR_32RegClass;
  Register LoReg = MRI.createVirtualRegister(RC);
  Register HiReg = MRI.createVirtualRegister(RC);
  const APInt &Imm = APInt(Size, I.getOperand(1).getImm());

  BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
    .addImm(Imm.trunc(32).getZExtValue());

  BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
    .addImm(Imm.ashr(32).getZExtValue());

  const MachineInstr *RS =
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(LoReg)
      .addImm(AMDGPU::sub0)
      .addReg(HiReg)
      .addImm(AMDGPU::sub1);

  // We can't call constrainSelectedInstRegOperands here, because it doesn't
  // work for target-independent opcodes.
  I.eraseFromParent();
  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI);
  if (!DstRC)
    return true;
  return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_GEP)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}
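
// Note: getAddrModeInfo above recurses through chained G_GEPs, recording the
// GEP feeding the load first, followed by any GEPs feeding its base pointer.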
bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
  }
}
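
// Illustrative: on subtargets where ldsRequiresM0Init() is true, initM0 emits
//   $m0 = S_MOV_B32 -1
// before selecting the DS instruction; -1 is assumed here to act as an
// effectively unlimited LDS bound.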
bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}

bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for
  // now RegBankSelect knows what it's doing if the branch condition is scc,
  // even though it currently does not.
  if (isSCC(CondReg, MRI)) {
    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    ConstrainRC = &AMDGPU::SReg_32_XM0RegClass;
  } else if (isVCC(CondReg, MRI)) {
    // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // We sort of know, based on the register bank, that a VCC producer ands
    // inactive lanes with 0. What if there was a logical operation with vcc
    // producers in different blocks/with different exec masks?
    // FIXME: Should scc->vcc copies and with exec?
    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  } else
    return false;

  if (!MRI.getRegClassOrNull(CondReg))
    MRI.setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
  if (IsVGPR)
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  return RBI.constrainGenericRegister(
    DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI);
}

bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const {
  uint64_t Align = I.getOperand(2).getImm();
  const uint64_t Mask = ~((UINT64_C(1) << Align) - 1);

  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
  unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
  const TargetRegisterClass &RegRC
    = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  LLT Ty = MRI.getType(DstReg);

  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
                                                                  MRI);
  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
                                                                  MRI);
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI) ||
      !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  Register ImmReg = MRI.createVirtualRegister(&RegRC);
  BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg)
    .addImm(Mask);

  if (Ty.getSizeInBits() == 32) {
    BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
      .addReg(SrcReg)
      .addReg(ImmReg);
    I.eraseFromParent();
    return true;
  }

  Register HiReg = MRI.createVirtualRegister(&RegRC);
  Register LoReg = MRI.createVirtualRegister(&RegRC);
  Register MaskLo = MRI.createVirtualRegister(&RegRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(SrcReg, 0, AMDGPU::sub0);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(SrcReg, 0, AMDGPU::sub1);

  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo)
    .addReg(LoReg)
    .addReg(ImmReg);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(MaskLo)
    .addImm(AMDGPU::sub0)
    .addReg(HiReg)
    .addImm(AMDGPU::sub1);
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::select(MachineInstr &I) {
  if (I.isPHI())
    return selectPHI(I);

  if (!isPreISelGenericOpcode(I.getOpcode())) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    if (selectG_AND_OR_XOR(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_ADD_SUB(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
    return selectCOPY(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
    return selectG_CONSTANT(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_GEP:
    return selectG_GEP(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
    if (selectG_ICMP(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_FADD:
    return selectG_LOAD_ATOMICRMW(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_STORE:
    return selectG_STORE(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return selectG_SZA_EXT(I);
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_FRAME_INDEX:
    return selectG_FRAME_INDEX(I);
  case TargetOpcode::G_FENCE:
    // FIXME: Tablegen importer doesn't handle the imm operands correctly, and
    // is checking for G_CONSTANT
    I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE));
    return true;
  case TargetOpcode::G_PTR_MASK:
    return selectG_PTR_MASK(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
  return false;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(
  Register Src, const MachineRegisterInfo &MRI) const {
  unsigned Mods = 0;
  MachineInstr *MI = MRI.getVRegDef(Src);

  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::NEG;
    MI = MRI.getVRegDef(Src);
  }

  if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::ABS;
  }

  return std::make_pair(Src, Mods);
}
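
// selectVOP3ModsImpl above folds G_FNEG/G_FABS producers into VOP3 source
// modifiers, so e.g. fneg(fabs(x)) selects as x with both NEG and ABS set.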
///
/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  MachineRegisterInfo &MRI
    = Root.getParent()->getParent()->getParent()->getRegInfo();

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const {
  MachineRegisterInfo &MRI
    = Root.getParent()->getParent()->getParent()->getRegInfo();

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  MachineRegisterInfo &MRI
    = Root.getParent()->getParent()->getParent()->getRegInfo();

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
  // FIXME: Handle clamp and op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // clamp
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
  // FIXME: Handle op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
  }};
}
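
// The SMRD renderers below match an (SGPR base + constant offset) G_GEP
// against scalar load addressing modes; AMDGPU::getSMRDEncodedOffset converts
// the byte offset into the subtarget's immediate encoding.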
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  MachineRegisterInfo &MRI =
    Root.getParent()->getParent()->getParent()->getRegInfo();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];

  if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
    return None;

  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  MachineRegisterInfo &MRI =
    Root.getParent()->getParent()->getParent()->getRegInfo();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  if (!isUInt<32>(EncodedImm))
    return None;

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets, not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far, we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed to select this load into one of the _IMM variants since the _IMM
  // patterns are considered before the _SGPR patterns.
  unsigned PtrReg = GEPInfo.SgprParts[0];
  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(GEPInfo.Imm);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}

template <bool Signed>
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

  InstructionSelector::ComplexRendererFns Default = {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
  }};

  if (!STI.hasFlatInstOffsets())
    return Default;

  const MachineInstr *OpDef = MRI.getVRegDef(Root.getReg());
  if (!OpDef || OpDef->getOpcode() != AMDGPU::G_GEP)
    return Default;

  Optional<int64_t> Offset =
      getConstantVRegVal(OpDef->getOperand(2).getReg(), MRI);
  if (!Offset.hasValue())
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
    return Default;

  Register BasePtr = OpDef->getOperand(1).getReg();

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  return selectFlatOffsetImpl<false>(Root);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
  return selectFlatOffsetImpl<true>(Root);
}

static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}
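// MUBUF instructions encode a 12-bit unsigned immediate offset
// (SIInstrInfo::isLegalMUBUFImmOffset is an isUInt<12> check), so a
// known-constant scratch address is split below: the 4096-aligned high bits
// are materialized into vaddr with V_MOV_B32 and the low 12 bits go in the
// immediate offset field. For example, a constant address of 4100 becomes
// vaddr = V_MOV_B32 4096 with offset:4.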
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), MRI, m_ICst(Offset))) {
    Register HighBits = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
        .addImm(Offset & ~4095); // 4096-aligned high bits of the offset.

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               const MachineMemOperand *MMO = *MI->memoperands_begin();
               const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

               Register SOffsetReg = isStackPtrRelative(PtrInfo)
                                         ? Info->getStackPtrOffsetReg()
                                         : Info->getScratchWaveOffsetReg();
               MIB.addReg(SOffsetReg);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095); // Low 12 bits fit the imm field.
             }}};
  }

  assert(Offset == 0);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI.getVRegDef(Root.getReg())) {
    if (isBaseWithConstantOffset(Root, MRI)) {
      const MachineOperand &LHS = RootDef->getOperand(1);
      const MachineOperand &RHS = RootDef->getOperand(2);
      const MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
      const MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
      if (LHSDef && RHSDef) {
        int64_t PossibleOffset =
            RHSDef->getOperand(1).getCImm()->getSExtValue();
        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
            (!STI.privateMemoryResourceIsRangeChecked() ||
             KnownBits->signBitIsZero(LHS.getReg()))) {
          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
            FI = LHSDef->getOperand(1).getIndex();
          else
            VAddr = LHS.getReg();
          Offset = PossibleOffset;
        }
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  // TODO: Should split large offsets that don't fit, like above.
  // TODO: Don't use the scratch wave offset just because the offset didn't
  // fit.
  Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
                                   : Info->getScratchWaveOffsetReg();

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             MIB.addReg(SOffset);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}

bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
                                                const MachineOperand &Base,
                                                int64_t Offset,
                                                unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return KnownBits->signBitIsZero(Base.getReg());
}
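// For reference: single-address DS instructions take one 16-bit unsigned
// byte offset, while the two-address forms (e.g. DS_READ2_B32) take two
// 8-bit offsets scaled by the access size, which is why isDSOffsetLegal
// above is parameterized on OffsetBits (16 or 8).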
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  Register SOffsetReg = isStackPtrRelative(PtrInfo)
                            ? Info->getStackPtrOffsetReg()
                            : Info->getScratchWaveOffsetReg();
  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(Info->getScratchRSrcReg());
      },                                                         // rsrc
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }      // offset
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

  const MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
  if (!RootDef) {
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
    }};
  }

  int64_t ConstAddr = 0;
  if (isBaseWithConstantOffset(Root, MRI)) {
    const MachineOperand &LHS = RootDef->getOperand(1);
    const MachineOperand &RHS = RootDef->getOperand(2);
    const MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
    const MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
    if (LHSDef && RHSDef) {
      int64_t PossibleOffset =
          RHSDef->getOperand(1).getCImm()->getSExtValue();
      if (isDSOffsetLegal(MRI, LHS, PossibleOffset, 16)) {
        // (add n0, c0)
        return {{
            [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); }
        }};
      }
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO: Handle G_SUB base addresses (unimplemented; falls through to the
    // default case below).
  } else if (mi_match(Root.getReg(), MRI, m_ICst(ConstAddr))) {
    // TODO: Handle constant addresses (unimplemented; falls through to the
    // default case below).
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
  }};
}
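// Note on how these hook up: each ComplexRendererFns selector in this file
// backs a ComplexPattern used by the TableGen-generated matcher in
// AMDGPUGenGlobalISel.inc. When a generated pattern matches, the returned
// lambdas are invoked in order, each one rendering a single operand onto
// the MachineInstrBuilder for the instruction being emitted.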