1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUInstructionSelector.h" 15 #include "AMDGPUInstrInfo.h" 16 #include "AMDGPUGlobalISelUtils.h" 17 #include "AMDGPURegisterBankInfo.h" 18 #include "AMDGPURegisterInfo.h" 19 #include "AMDGPUSubtarget.h" 20 #include "AMDGPUTargetMachine.h" 21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 22 #include "SIMachineFunctionInfo.h" 23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 24 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 25 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 28 #include "llvm/CodeGen/GlobalISel/Utils.h" 29 #include "llvm/CodeGen/MachineBasicBlock.h" 30 #include "llvm/CodeGen/MachineFunction.h" 31 #include "llvm/CodeGen/MachineInstr.h" 32 #include "llvm/CodeGen/MachineInstrBuilder.h" 33 #include "llvm/CodeGen/MachineRegisterInfo.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/Support/Debug.h" 36 #include "llvm/Support/raw_ostream.h" 37 38 #define DEBUG_TYPE "amdgpu-isel" 39 40 using namespace llvm; 41 using namespace MIPatternMatch; 42 43 #define GET_GLOBALISEL_IMPL 44 #define AMDGPUSubtarget GCNSubtarget 45 #include "AMDGPUGenGlobalISel.inc" 46 #undef GET_GLOBALISEL_IMPL 47 #undef AMDGPUSubtarget 48 49 AMDGPUInstructionSelector::AMDGPUInstructionSelector( 50 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, 51 const AMDGPUTargetMachine &TM) 52 : InstructionSelector(), TII(*STI.getInstrInfo()), 53 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), 54 STI(STI), 55 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), 56 #define GET_GLOBALISEL_PREDICATES_INIT 57 #include "AMDGPUGenGlobalISel.inc" 58 #undef GET_GLOBALISEL_PREDICATES_INIT 59 #define GET_GLOBALISEL_TEMPORARIES_INIT 60 #include "AMDGPUGenGlobalISel.inc" 61 #undef GET_GLOBALISEL_TEMPORARIES_INIT 62 { 63 } 64 65 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } 66 67 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, 68 CodeGenCoverage &CoverageInfo) { 69 MRI = &MF.getRegInfo(); 70 InstructionSelector::setupMF(MF, KB, CoverageInfo); 71 } 72 73 static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { 74 if (Register::isPhysicalRegister(Reg)) 75 return Reg == AMDGPU::SCC; 76 77 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 78 const TargetRegisterClass *RC = 79 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 80 if (RC) { 81 // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the 82 // context of the register bank has been lost. 83 // As a hack, getRegClassForSizeOnBank uses exactly SGPR_32RegClass, which 84 // won't ever be constrained any further.
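// With only a register class and no bank left to inspect, an exact SGPR_32 class with a 1-bit type is the best indication of SCC we still have.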
85 if (RC != &AMDGPU::SGPR_32RegClass) 86 return false; 87 const LLT Ty = MRI.getType(Reg); 88 return Ty.isValid() && Ty.getSizeInBits() == 1; 89 } 90 91 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 92 return RB->getID() == AMDGPU::SCCRegBankID; 93 } 94 95 bool AMDGPUInstructionSelector::isVCC(Register Reg, 96 const MachineRegisterInfo &MRI) const { 97 if (Register::isPhysicalRegister(Reg)) 98 return Reg == TRI.getVCC(); 99 100 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 101 const TargetRegisterClass *RC = 102 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 103 if (RC) { 104 const LLT Ty = MRI.getType(Reg); 105 return RC->hasSuperClassEq(TRI.getBoolRC()) && 106 Ty.isValid() && Ty.getSizeInBits() == 1; 107 } 108 109 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 110 return RB->getID() == AMDGPU::VCCRegBankID; 111 } 112 113 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { 114 const DebugLoc &DL = I.getDebugLoc(); 115 MachineBasicBlock *BB = I.getParent(); 116 I.setDesc(TII.get(TargetOpcode::COPY)); 117 118 const MachineOperand &Src = I.getOperand(1); 119 MachineOperand &Dst = I.getOperand(0); 120 Register DstReg = Dst.getReg(); 121 Register SrcReg = Src.getReg(); 122 123 if (isVCC(DstReg, *MRI)) { 124 if (SrcReg == AMDGPU::SCC) { 125 const TargetRegisterClass *RC 126 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 127 if (!RC) 128 return true; 129 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 130 } 131 132 if (!isVCC(SrcReg, *MRI)) { 133 // TODO: Should probably leave the copy and let copyPhysReg expand it. 134 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI)) 135 return false; 136 137 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) 138 .addImm(0) 139 .addReg(SrcReg); 140 141 if (!MRI->getRegClassOrNull(SrcReg)) 142 MRI->setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, *MRI)); 143 I.eraseFromParent(); 144 return true; 145 } 146 147 const TargetRegisterClass *RC = 148 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 149 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) 150 return false; 151 152 // Don't constrain the source register to a class so the def instruction 153 // handles it (unless it's undef). 154 // 155 // FIXME: This is a hack. When selecting the def, we need to know 156 // specifically that the result is VCCRegBank, and not just an SGPR 157 // with size 1. An SReg_32 with size 1 is ambiguous with wave32. 158 if (Src.isUndef()) { 159 const TargetRegisterClass *SrcRC = 160 TRI.getConstrainedRegClassForOperand(Src, *MRI); 161 if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 162 return false; 163 } 164 165 return true; 166 } 167 168 for (const MachineOperand &MO : I.operands()) { 169 if (Register::isPhysicalRegister(MO.getReg())) 170 continue; 171 172 const TargetRegisterClass *RC = 173 TRI.getConstrainedRegClassForOperand(MO, *MRI); 174 if (!RC) 175 continue; 176 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 177 } 178 return true; 179 } 180 181 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { 182 const Register DefReg = I.getOperand(0).getReg(); 183 const LLT DefTy = MRI->getType(DefReg); 184 185 // TODO: Verify this doesn't have insane operands (i.e.
VGPR to SGPR copy) 186 187 const RegClassOrRegBank &RegClassOrBank = 188 MRI->getRegClassOrRegBank(DefReg); 189 190 const TargetRegisterClass *DefRC 191 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 192 if (!DefRC) { 193 if (!DefTy.isValid()) { 194 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 195 return false; 196 } 197 198 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 199 if (RB.getID() == AMDGPU::SCCRegBankID) { 200 LLVM_DEBUG(dbgs() << "illegal scc phi\n"); 201 return false; 202 } 203 204 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); 205 if (!DefRC) { 206 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 207 return false; 208 } 209 } 210 211 I.setDesc(TII.get(TargetOpcode::PHI)); 212 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); 213 } 214 215 MachineOperand 216 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, 217 const TargetRegisterClass &SubRC, 218 unsigned SubIdx) const { 219 220 MachineInstr *MI = MO.getParent(); 221 MachineBasicBlock *BB = MO.getParent()->getParent(); 222 Register DstReg = MRI->createVirtualRegister(&SubRC); 223 224 if (MO.isReg()) { 225 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); 226 Register Reg = MO.getReg(); 227 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 228 .addReg(Reg, 0, ComposedSubIdx); 229 230 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), 231 MO.isKill(), MO.isDead(), MO.isUndef(), 232 MO.isEarlyClobber(), 0, MO.isDebug(), 233 MO.isInternalRead()); 234 } 235 236 assert(MO.isImm()); 237 238 APInt Imm(64, MO.getImm()); 239 240 switch (SubIdx) { 241 default: 242 llvm_unreachable("do not know to split immediate with this sub index."); 243 case AMDGPU::sub0: 244 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); 245 case AMDGPU::sub1: 246 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); 247 } 248 } 249 250 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { 251 switch (Opc) { 252 case AMDGPU::G_AND: 253 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 254 case AMDGPU::G_OR: 255 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; 256 case AMDGPU::G_XOR: 257 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 258 default: 259 llvm_unreachable("not a bit op"); 260 } 261 } 262 263 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 264 MachineOperand &Dst = I.getOperand(0); 265 MachineOperand &Src0 = I.getOperand(1); 266 MachineOperand &Src1 = I.getOperand(2); 267 Register DstReg = Dst.getReg(); 268 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 269 270 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 271 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 272 const TargetRegisterClass *RC = TRI.getBoolRC(); 273 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), 274 RC == &AMDGPU::SReg_64RegClass); 275 I.setDesc(TII.get(InstOpc)); 276 277 // FIXME: Hack to avoid turning the register bank into a register class. 278 // The selector for G_ICMP relies on seeing the register bank for the result 279 // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will 280 // be ambiguous whether it's a scalar or vector bool. 
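// Only force the bool class onto completely undefined sources that have no class yet; other sources keep their VCC bank so their producers still select correctly.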
281 if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) 282 MRI->setRegClass(Src0.getReg(), RC); 283 if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) 284 MRI->setRegClass(Src1.getReg(), RC); 285 286 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 287 } 288 289 // TODO: Should this allow an SCC bank result, and produce a copy from SCC for 290 // the result? 291 if (DstRB->getID() == AMDGPU::SGPRRegBankID) { 292 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); 293 I.setDesc(TII.get(InstOpc)); 294 // Dead implicit-def of scc 295 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 296 true, // isImp 297 false, // isKill 298 true)); // isDead 299 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 300 } 301 302 return false; 303 } 304 305 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 306 MachineBasicBlock *BB = I.getParent(); 307 MachineFunction *MF = BB->getParent(); 308 Register DstReg = I.getOperand(0).getReg(); 309 const DebugLoc &DL = I.getDebugLoc(); 310 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 311 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 312 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 313 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 314 315 if (Size == 32) { 316 if (IsSALU) { 317 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 318 MachineInstr *Add = 319 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 320 .add(I.getOperand(1)) 321 .add(I.getOperand(2)); 322 I.eraseFromParent(); 323 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 324 } 325 326 if (STI.hasAddNoCarry()) { 327 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 328 I.setDesc(TII.get(Opc)); 329 I.addOperand(*MF, MachineOperand::CreateImm(0)); 330 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 331 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 332 } 333 334 const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; 335 336 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); 337 MachineInstr *Add 338 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 339 .addDef(UnusedCarry, RegState::Dead) 340 .add(I.getOperand(1)) 341 .add(I.getOperand(2)) 342 .addImm(0); 343 I.eraseFromParent(); 344 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 345 } 346 347 assert(!Sub && "illegal sub should not reach here"); 348 349 const TargetRegisterClass &RC 350 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; 351 const TargetRegisterClass &HalfRC 352 = IsSALU ? 
AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass; 353 354 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0)); 355 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0)); 356 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); 357 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); 358 359 Register DstLo = MRI->createVirtualRegister(&HalfRC); 360 Register DstHi = MRI->createVirtualRegister(&HalfRC); 361 362 if (IsSALU) { 363 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) 364 .add(Lo1) 365 .add(Lo2); 366 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) 367 .add(Hi1) 368 .add(Hi2); 369 } else { 370 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); 371 Register CarryReg = MRI->createVirtualRegister(CarryRC); 372 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo) 373 .addDef(CarryReg) 374 .add(Lo1) 375 .add(Lo2) 376 .addImm(0); 377 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) 378 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) 379 .add(Hi1) 380 .add(Hi2) 381 .addReg(CarryReg, RegState::Kill) 382 .addImm(0); 383 384 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) 385 return false; 386 } 387 388 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 389 .addReg(DstLo) 390 .addImm(AMDGPU::sub0) 391 .addReg(DstHi) 392 .addImm(AMDGPU::sub1); 393 394 395 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 396 return false; 397 398 I.eraseFromParent(); 399 return true; 400 } 401 402 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO(MachineInstr &I) const { 403 MachineBasicBlock *BB = I.getParent(); 404 MachineFunction *MF = BB->getParent(); 405 const DebugLoc &DL = I.getDebugLoc(); 406 Register Dst0Reg = I.getOperand(0).getReg(); 407 Register Dst1Reg = I.getOperand(1).getReg(); 408 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO; 409 410 if (!isSCC(Dst1Reg, *MRI)) { 411 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned 412 // carry out despite the _i32 name. These were renamed in VI to _U32. 413 // FIXME: We should probably rename the opcodes here. 414 unsigned NewOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; 415 I.setDesc(TII.get(NewOpc)); 416 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 417 I.addOperand(*MF, MachineOperand::CreateImm(0)); 418 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 419 } 420 421 Register Src0Reg = I.getOperand(2).getReg(); 422 Register Src1Reg = I.getOperand(3).getReg(); 423 unsigned NewOpc = IsAdd ? 
AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 424 BuildMI(*BB, &I, DL, TII.get(NewOpc), Dst0Reg) 425 .add(I.getOperand(2)) 426 .add(I.getOperand(3)); 427 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 428 .addReg(AMDGPU::SCC); 429 430 if (!MRI->getRegClassOrNull(Dst1Reg)) 431 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 432 433 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 434 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 435 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 436 return false; 437 438 I.eraseFromParent(); 439 return true; 440 } 441 442 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 443 MachineBasicBlock *BB = I.getParent(); 444 Register DstReg = I.getOperand(0).getReg(); 445 Register SrcReg = I.getOperand(1).getReg(); 446 LLT DstTy = MRI->getType(DstReg); 447 LLT SrcTy = MRI->getType(SrcReg); 448 const unsigned SrcSize = SrcTy.getSizeInBits(); 449 const unsigned DstSize = DstTy.getSizeInBits(); 450 451 // TODO: Should handle any multiple of 32 offset. 452 unsigned Offset = I.getOperand(2).getImm(); 453 if (Offset % DstSize != 0) 454 return false; 455 456 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 457 const TargetRegisterClass *SrcRC = 458 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 459 if (!SrcRC) 460 return false; 461 462 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 463 464 const DebugLoc &DL = I.getDebugLoc(); 465 MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) 466 .addReg(SrcReg, 0, SubRegs[Offset / DstSize]); 467 468 for (const MachineOperand &MO : Copy->operands()) { 469 const TargetRegisterClass *RC = 470 TRI.getConstrainedRegClassForOperand(MO, *MRI); 471 if (!RC) 472 continue; 473 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 474 } 475 I.eraseFromParent(); 476 return true; 477 } 478 479 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 480 MachineBasicBlock *BB = MI.getParent(); 481 Register DstReg = MI.getOperand(0).getReg(); 482 LLT DstTy = MRI->getType(DstReg); 483 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 484 485 const unsigned SrcSize = SrcTy.getSizeInBits(); 486 if (SrcSize < 32) 487 return selectImpl(MI, *CoverageInfo); 488 489 const DebugLoc &DL = MI.getDebugLoc(); 490 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 491 const unsigned DstSize = DstTy.getSizeInBits(); 492 const TargetRegisterClass *DstRC = 493 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 494 if (!DstRC) 495 return false; 496 497 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 498 MachineInstrBuilder MIB = 499 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 500 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 501 MachineOperand &Src = MI.getOperand(I + 1); 502 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 503 MIB.addImm(SubRegs[I]); 504 505 const TargetRegisterClass *SrcRC 506 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 507 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 508 return false; 509 } 510 511 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 512 return false; 513 514 MI.eraseFromParent(); 515 return true; 516 } 517 518 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 519 MachineBasicBlock *BB = MI.getParent(); 520 const int NumDst = MI.getNumOperands() - 1; 521 522 MachineOperand 
&Src = MI.getOperand(NumDst); 523 524 Register SrcReg = Src.getReg(); 525 Register DstReg0 = MI.getOperand(0).getReg(); 526 LLT DstTy = MRI->getType(DstReg0); 527 LLT SrcTy = MRI->getType(SrcReg); 528 529 const unsigned DstSize = DstTy.getSizeInBits(); 530 const unsigned SrcSize = SrcTy.getSizeInBits(); 531 const DebugLoc &DL = MI.getDebugLoc(); 532 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 533 534 const TargetRegisterClass *SrcRC = 535 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 536 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 537 return false; 538 539 const unsigned SrcFlags = getUndefRegState(Src.isUndef()); 540 541 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 542 // source, and this relies on the fact that the same subregister indices are 543 // used for both. 544 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 545 for (int I = 0, E = NumDst; I != E; ++I) { 546 MachineOperand &Dst = MI.getOperand(I); 547 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 548 .addReg(SrcReg, SrcFlags, SubRegs[I]); 549 550 const TargetRegisterClass *DstRC = 551 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 552 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 553 return false; 554 } 555 556 MI.eraseFromParent(); 557 return true; 558 } 559 560 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 561 return selectG_ADD_SUB(I); 562 } 563 564 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 565 const MachineOperand &MO = I.getOperand(0); 566 567 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 568 // regbank check here is to know why getConstrainedRegClassForOperand failed. 569 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 570 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 571 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 572 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 573 return true; 574 } 575 576 return false; 577 } 578 579 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 580 MachineBasicBlock *BB = I.getParent(); 581 582 Register DstReg = I.getOperand(0).getReg(); 583 Register Src0Reg = I.getOperand(1).getReg(); 584 Register Src1Reg = I.getOperand(2).getReg(); 585 LLT Src1Ty = MRI->getType(Src1Reg); 586 587 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 588 unsigned InsSize = Src1Ty.getSizeInBits(); 589 590 int64_t Offset = I.getOperand(3).getImm(); 591 if (Offset % 32 != 0) 592 return false; 593 594 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 595 if (SubReg == AMDGPU::NoSubRegister) 596 return false; 597 598 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 599 const TargetRegisterClass *DstRC = 600 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 601 if (!DstRC) 602 return false; 603 604 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 605 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 606 const TargetRegisterClass *Src0RC = 607 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 608 const TargetRegisterClass *Src1RC = 609 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 610 611 // Deal with weird cases where the class only partially supports the subreg 612 // index. 
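// getSubClassWithSubReg returns null if no subclass of Src0RC supports SubReg, in which case we cannot select this as an INSERT_SUBREG.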
613 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); 614 if (!Src0RC) 615 return false; 616 617 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 618 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) || 619 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI)) 620 return false; 621 622 const DebugLoc &DL = I.getDebugLoc(); 623 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg) 624 .addReg(Src0Reg) 625 .addReg(Src1Reg) 626 .addImm(SubReg); 627 628 I.eraseFromParent(); 629 return true; 630 } 631 632 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { 633 unsigned IntrinsicID = I.getIntrinsicID(); 634 switch (IntrinsicID) { 635 case Intrinsic::amdgcn_if_break: { 636 MachineBasicBlock *BB = I.getParent(); 637 638 // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick 639 // SelectionDAG uses for wave32 vs wave64. 640 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK)) 641 .add(I.getOperand(0)) 642 .add(I.getOperand(2)) 643 .add(I.getOperand(3)); 644 645 Register DstReg = I.getOperand(0).getReg(); 646 Register Src0Reg = I.getOperand(2).getReg(); 647 Register Src1Reg = I.getOperand(3).getReg(); 648 649 I.eraseFromParent(); 650 651 for (Register Reg : { DstReg, Src0Reg, Src1Reg }) 652 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 653 654 return true; 655 } 656 default: 657 return selectImpl(I, *CoverageInfo); 658 } 659 } 660 661 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) { 662 if (Size != 32 && Size != 64) 663 return -1; 664 switch (P) { 665 default: 666 llvm_unreachable("Unknown condition code!"); 667 case CmpInst::ICMP_NE: 668 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64; 669 case CmpInst::ICMP_EQ: 670 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64; 671 case CmpInst::ICMP_SGT: 672 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64; 673 case CmpInst::ICMP_SGE: 674 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64; 675 case CmpInst::ICMP_SLT: 676 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64; 677 case CmpInst::ICMP_SLE: 678 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64; 679 case CmpInst::ICMP_UGT: 680 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64; 681 case CmpInst::ICMP_UGE: 682 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64; 683 case CmpInst::ICMP_ULT: 684 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64; 685 case CmpInst::ICMP_ULE: 686 return Size == 32 ? 
AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64; 687 } 688 } 689 690 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, 691 unsigned Size) const { 692 if (Size == 64) { 693 if (!STI.hasScalarCompareEq64()) 694 return -1; 695 696 switch (P) { 697 case CmpInst::ICMP_NE: 698 return AMDGPU::S_CMP_LG_U64; 699 case CmpInst::ICMP_EQ: 700 return AMDGPU::S_CMP_EQ_U64; 701 default: 702 return -1; 703 } 704 } 705 706 if (Size != 32) 707 return -1; 708 709 switch (P) { 710 case CmpInst::ICMP_NE: 711 return AMDGPU::S_CMP_LG_U32; 712 case CmpInst::ICMP_EQ: 713 return AMDGPU::S_CMP_EQ_U32; 714 case CmpInst::ICMP_SGT: 715 return AMDGPU::S_CMP_GT_I32; 716 case CmpInst::ICMP_SGE: 717 return AMDGPU::S_CMP_GE_I32; 718 case CmpInst::ICMP_SLT: 719 return AMDGPU::S_CMP_LT_I32; 720 case CmpInst::ICMP_SLE: 721 return AMDGPU::S_CMP_LE_I32; 722 case CmpInst::ICMP_UGT: 723 return AMDGPU::S_CMP_GT_U32; 724 case CmpInst::ICMP_UGE: 725 return AMDGPU::S_CMP_GE_U32; 726 case CmpInst::ICMP_ULT: 727 return AMDGPU::S_CMP_LT_U32; 728 case CmpInst::ICMP_ULE: 729 return AMDGPU::S_CMP_LE_U32; 730 default: 731 llvm_unreachable("Unknown condition code!"); 732 } 733 } 734 735 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { 736 MachineBasicBlock *BB = I.getParent(); 737 const DebugLoc &DL = I.getDebugLoc(); 738 739 Register SrcReg = I.getOperand(2).getReg(); 740 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 741 742 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 743 744 Register CCReg = I.getOperand(0).getReg(); 745 if (isSCC(CCReg, *MRI)) { 746 int Opcode = getS_CMPOpcode(Pred, Size); 747 if (Opcode == -1) 748 return false; 749 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) 750 .add(I.getOperand(2)) 751 .add(I.getOperand(3)); 752 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) 753 .addReg(AMDGPU::SCC); 754 bool Ret = 755 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && 756 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); 757 I.eraseFromParent(); 758 return Ret; 759 } 760 761 int Opcode = getV_CMPOpcode(Pred, Size); 762 if (Opcode == -1) 763 return false; 764 765 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), 766 I.getOperand(0).getReg()) 767 .add(I.getOperand(2)) 768 .add(I.getOperand(3)); 769 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), 770 *TRI.getBoolRC(), *MRI); 771 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); 772 I.eraseFromParent(); 773 return Ret; 774 } 775 776 static MachineInstr * 777 buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt, 778 unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3, 779 unsigned VM, bool Compr, unsigned Enabled, bool Done) { 780 const DebugLoc &DL = Insert->getDebugLoc(); 781 MachineBasicBlock &BB = *Insert->getParent(); 782 unsigned Opcode = Done ? 
AMDGPU::EXP_DONE : AMDGPU::EXP; 783 return BuildMI(BB, Insert, DL, TII.get(Opcode)) 784 .addImm(Tgt) 785 .addReg(Reg0) 786 .addReg(Reg1) 787 .addReg(Reg2) 788 .addReg(Reg3) 789 .addImm(VM) 790 .addImm(Compr) 791 .addImm(Enabled); 792 } 793 794 static bool isZero(Register Reg, MachineRegisterInfo &MRI) { 795 int64_t C; 796 if (mi_match(Reg, MRI, m_ICst(C)) && C == 0) 797 return true; 798 799 // FIXME: matcher should ignore copies 800 return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; 801 } 802 803 static unsigned extractGLC(unsigned AuxiliaryData) { 804 return AuxiliaryData & 1; 805 } 806 807 static unsigned extractSLC(unsigned AuxiliaryData) { 808 return (AuxiliaryData >> 1) & 1; 809 } 810 811 static unsigned extractDLC(unsigned AuxiliaryData) { 812 return (AuxiliaryData >> 2) & 1; 813 } 814 815 static unsigned extractSWZ(unsigned AuxiliaryData) { 816 return (AuxiliaryData >> 3) & 1; 817 } 818 819 static unsigned getBufferStoreOpcode(LLT Ty, 820 const unsigned MemSize, 821 const bool Offen) { 822 const int Size = Ty.getSizeInBits(); 823 switch (8 * MemSize) { 824 case 8: 825 return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : 826 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; 827 case 16: 828 return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : 829 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; 830 default: 831 unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : 832 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; 833 if (Size > 32) 834 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); 835 return Opc; 836 } 837 } 838 839 static unsigned getBufferStoreFormatOpcode(LLT Ty, 840 const unsigned MemSize, 841 const bool Offen) { 842 bool IsD16Packed = Ty.getScalarSizeInBits() == 16; 843 bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits(); 844 int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 845 846 if (IsD16Packed) { 847 switch (NumElts) { 848 case 1: 849 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : 850 AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; 851 case 2: 852 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact : 853 AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact; 854 case 3: 855 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact : 856 AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact; 857 case 4: 858 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact : 859 AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact; 860 default: 861 return -1; 862 } 863 } 864 865 if (IsD16Unpacked) { 866 switch (NumElts) { 867 case 1: 868 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : 869 AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; 870 case 2: 871 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact : 872 AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact; 873 case 3: 874 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact : 875 AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact; 876 case 4: 877 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact : 878 AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact; 879 default: 880 return -1; 881 } 882 } 883 884 switch (NumElts) { 885 case 1: 886 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact : 887 AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact; 888 case 2: 889 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact : 890 AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact; 891 case 3: 892 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact : 893 AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact; 894 case 4: 895 return Offen ? 
AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact : 896 AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact; 897 default: 898 return -1; 899 } 900 901 llvm_unreachable("unhandled buffer store"); 902 } 903 904 // TODO: Move this to combiner 905 // Returns base register, imm offset, total constant offset. 906 std::tuple<Register, unsigned, unsigned> 907 AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B, 908 Register OrigOffset) const { 909 const unsigned MaxImm = 4095; 910 Register BaseReg; 911 unsigned TotalConstOffset; 912 MachineInstr *OffsetDef; 913 914 std::tie(BaseReg, TotalConstOffset, OffsetDef) 915 = AMDGPU::getBaseWithConstantOffset(*MRI, OrigOffset); 916 917 unsigned ImmOffset = TotalConstOffset; 918 919 // If the immediate value is too big for the immoffset field, put the value 920 // and -4096 into the immoffset field so that the value that is copied/added 921 // for the voffset field is a multiple of 4096, and it stands more chance 922 // of being CSEd with the copy/add for another similar load/store.f 923 // However, do not do that rounding down to a multiple of 4096 if that is a 924 // negative number, as it appears to be illegal to have a negative offset 925 // in the vgpr, even if adding the immediate offset makes it positive. 926 unsigned Overflow = ImmOffset & ~MaxImm; 927 ImmOffset -= Overflow; 928 if ((int32_t)Overflow < 0) { 929 Overflow += ImmOffset; 930 ImmOffset = 0; 931 } 932 933 if (Overflow != 0) { 934 // In case this is in a waterfall loop, insert offset code at the def point 935 // of the offset, not inside the loop. 936 MachineBasicBlock::iterator OldInsPt = B.getInsertPt(); 937 MachineBasicBlock &OldMBB = B.getMBB(); 938 B.setInstr(*OffsetDef); 939 940 if (!BaseReg) { 941 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 942 B.buildInstr(AMDGPU::V_MOV_B32_e32) 943 .addDef(BaseReg) 944 .addImm(Overflow); 945 } else { 946 Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 947 B.buildInstr(AMDGPU::V_MOV_B32_e32) 948 .addDef(OverflowVal) 949 .addImm(Overflow); 950 951 Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 952 TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg) 953 .addReg(BaseReg) 954 .addReg(OverflowVal, RegState::Kill) 955 .addImm(0); 956 BaseReg = NewBaseReg; 957 } 958 959 B.setInsertPt(OldMBB, OldInsPt); 960 } 961 962 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 963 } 964 965 bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, 966 bool IsFormat) const { 967 MachineIRBuilder B(MI); 968 MachineFunction &MF = B.getMF(); 969 Register VData = MI.getOperand(1).getReg(); 970 LLT Ty = MRI->getType(VData); 971 972 int Size = Ty.getSizeInBits(); 973 if (Size % 32 != 0) 974 return false; 975 976 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 977 MachineMemOperand *MMO = *MI.memoperands_begin(); 978 const int MemSize = MMO->getSize(); 979 980 Register RSrc = MI.getOperand(2).getReg(); 981 Register VOffset = MI.getOperand(3).getReg(); 982 Register SOffset = MI.getOperand(4).getReg(); 983 unsigned AuxiliaryData = MI.getOperand(5).getImm(); 984 unsigned ImmOffset; 985 unsigned TotalOffset; 986 987 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 988 if (TotalOffset != 0) 989 MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize); 990 991 const bool Offen = !isZero(VOffset, *MRI); 992 993 int Opc = IsFormat ? 
getBufferStoreFormatOpcode(Ty, MemSize, Offen) : 994 getBufferStoreOpcode(Ty, MemSize, Offen); 995 if (Opc == -1) 996 return false; 997 998 MachineInstrBuilder MIB = B.buildInstr(Opc) 999 .addUse(VData); 1000 1001 if (Offen) 1002 MIB.addUse(VOffset); 1003 1004 MIB.addUse(RSrc) 1005 .addUse(SOffset) 1006 .addImm(ImmOffset) 1007 .addImm(extractGLC(AuxiliaryData)) 1008 .addImm(extractSLC(AuxiliaryData)) 1009 .addImm(0) // tfe: FIXME: Remove from inst 1010 .addImm(extractDLC(AuxiliaryData)) 1011 .addImm(extractSWZ(AuxiliaryData)) 1012 .addMemOperand(MMO); 1013 1014 MI.eraseFromParent(); 1015 1016 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1017 } 1018 1019 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( 1020 MachineInstr &I) const { 1021 MachineBasicBlock *BB = I.getParent(); 1022 unsigned IntrinsicID = I.getIntrinsicID(); 1023 switch (IntrinsicID) { 1024 case Intrinsic::amdgcn_exp: { 1025 int64_t Tgt = I.getOperand(1).getImm(); 1026 int64_t Enabled = I.getOperand(2).getImm(); 1027 int64_t Done = I.getOperand(7).getImm(); 1028 int64_t VM = I.getOperand(8).getImm(); 1029 1030 MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(), 1031 I.getOperand(4).getReg(), 1032 I.getOperand(5).getReg(), 1033 I.getOperand(6).getReg(), 1034 VM, false, Enabled, Done); 1035 1036 I.eraseFromParent(); 1037 return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI); 1038 } 1039 case Intrinsic::amdgcn_exp_compr: { 1040 const DebugLoc &DL = I.getDebugLoc(); 1041 int64_t Tgt = I.getOperand(1).getImm(); 1042 int64_t Enabled = I.getOperand(2).getImm(); 1043 Register Reg0 = I.getOperand(3).getReg(); 1044 Register Reg1 = I.getOperand(4).getReg(); 1045 Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1046 int64_t Done = I.getOperand(5).getImm(); 1047 int64_t VM = I.getOperand(6).getImm(); 1048 1049 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); 1050 MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM, 1051 true, Enabled, Done); 1052 1053 I.eraseFromParent(); 1054 return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI); 1055 } 1056 case Intrinsic::amdgcn_end_cf: { 1057 // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick 1058 // SelectionDAG uses for wave32 vs wave64. 1059 BuildMI(*BB, &I, I.getDebugLoc(), 1060 TII.get(AMDGPU::SI_END_CF)) 1061 .add(I.getOperand(1)); 1062 1063 Register Reg = I.getOperand(1).getReg(); 1064 I.eraseFromParent(); 1065 1066 if (!MRI->getRegClassOrNull(Reg)) 1067 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 1068 return true; 1069 } 1070 case Intrinsic::amdgcn_raw_buffer_store: 1071 return selectStoreIntrinsic(I, false); 1072 case Intrinsic::amdgcn_raw_buffer_store_format: 1073 return selectStoreIntrinsic(I, true); 1074 default: 1075 return selectImpl(I, *CoverageInfo); 1076 } 1077 } 1078 1079 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { 1080 MachineBasicBlock *BB = I.getParent(); 1081 const DebugLoc &DL = I.getDebugLoc(); 1082 1083 Register DstReg = I.getOperand(0).getReg(); 1084 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 1085 assert(Size <= 32 || Size == 64); 1086 const MachineOperand &CCOp = I.getOperand(1); 1087 Register CCReg = CCOp.getReg(); 1088 if (isSCC(CCReg, *MRI)) { 1089 unsigned SelectOpcode = Size == 64 ? 
AMDGPU::S_CSELECT_B64 : 1090 AMDGPU::S_CSELECT_B32; 1091 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1092 .addReg(CCReg); 1093 1094 // The generic constrainSelectedInstRegOperands doesn't work for the scc register 1095 // bank, because it does not cover the register class that we used to represent 1096 // for it. So we need to manually set the register class here. 1097 if (!MRI->getRegClassOrNull(CCReg)) 1098 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1099 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1100 .add(I.getOperand(2)) 1101 .add(I.getOperand(3)); 1102 1103 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1104 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1105 I.eraseFromParent(); 1106 return Ret; 1107 } 1108 1109 // Wide VGPR select should have been split in RegBankSelect. 1110 if (Size > 32) 1111 return false; 1112 1113 MachineInstr *Select = 1114 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1115 .addImm(0) 1116 .add(I.getOperand(3)) 1117 .addImm(0) 1118 .add(I.getOperand(2)) 1119 .add(I.getOperand(1)); 1120 1121 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1122 I.eraseFromParent(); 1123 return Ret; 1124 } 1125 1126 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1127 initM0(I); 1128 return selectImpl(I, *CoverageInfo); 1129 } 1130 1131 static int sizeToSubRegIndex(unsigned Size) { 1132 switch (Size) { 1133 case 32: 1134 return AMDGPU::sub0; 1135 case 64: 1136 return AMDGPU::sub0_sub1; 1137 case 96: 1138 return AMDGPU::sub0_sub1_sub2; 1139 case 128: 1140 return AMDGPU::sub0_sub1_sub2_sub3; 1141 case 256: 1142 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1143 default: 1144 if (Size < 32) 1145 return AMDGPU::sub0; 1146 if (Size > 256) 1147 return -1; 1148 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1149 } 1150 } 1151 1152 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1153 Register DstReg = I.getOperand(0).getReg(); 1154 Register SrcReg = I.getOperand(1).getReg(); 1155 const LLT DstTy = MRI->getType(DstReg); 1156 const LLT SrcTy = MRI->getType(SrcReg); 1157 if (!DstTy.isScalar()) 1158 return false; 1159 1160 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1161 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1162 if (SrcRB != DstRB) 1163 return false; 1164 1165 unsigned DstSize = DstTy.getSizeInBits(); 1166 unsigned SrcSize = SrcTy.getSizeInBits(); 1167 1168 const TargetRegisterClass *SrcRC 1169 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1170 const TargetRegisterClass *DstRC 1171 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1172 1173 if (SrcSize > 32) { 1174 int SubRegIdx = sizeToSubRegIndex(DstSize); 1175 if (SubRegIdx == -1) 1176 return false; 1177 1178 // Deal with weird cases where the class only partially supports the subreg 1179 // index. 1180 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1181 if (!SrcRC) 1182 return false; 1183 1184 I.getOperand(1).setSubReg(SubRegIdx); 1185 } 1186 1187 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1188 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1189 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1190 return false; 1191 } 1192 1193 I.setDesc(TII.get(TargetOpcode::COPY)); 1194 return true; 1195 } 1196 1197 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 
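/// On return, \p Mask holds a mask of \p Size trailing ones.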
1198 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1199 Mask = maskTrailingOnes<unsigned>(Size); 1200 int SignedMask = static_cast<int>(Mask); 1201 return SignedMask >= -16 && SignedMask <= 64; 1202 } 1203 1204 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1205 bool Signed = I.getOpcode() == AMDGPU::G_SEXT; 1206 const DebugLoc &DL = I.getDebugLoc(); 1207 MachineBasicBlock &MBB = *I.getParent(); 1208 const Register DstReg = I.getOperand(0).getReg(); 1209 const Register SrcReg = I.getOperand(1).getReg(); 1210 1211 const LLT DstTy = MRI->getType(DstReg); 1212 const LLT SrcTy = MRI->getType(SrcReg); 1213 const LLT S1 = LLT::scalar(1); 1214 const unsigned SrcSize = SrcTy.getSizeInBits(); 1215 const unsigned DstSize = DstTy.getSizeInBits(); 1216 if (!DstTy.isScalar()) 1217 return false; 1218 1219 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 1220 1221 if (SrcBank->getID() == AMDGPU::SCCRegBankID) { 1222 if (SrcTy != S1 || DstSize > 64) // Invalid 1223 return false; 1224 1225 unsigned Opcode = 1226 DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; 1227 const TargetRegisterClass *DstRC = 1228 DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass; 1229 1230 // FIXME: Create an extra copy to avoid incorrectly constraining the result 1231 // of the scc producer. 1232 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1233 BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg) 1234 .addReg(SrcReg); 1235 BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1236 .addReg(TmpReg); 1237 1238 // The instruction operands are backwards from what you would expect. 1239 BuildMI(MBB, I, DL, TII.get(Opcode), DstReg) 1240 .addImm(0) 1241 .addImm(Signed ? -1 : 1); 1242 I.eraseFromParent(); 1243 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1244 } 1245 1246 if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) { 1247 if (SrcTy != S1) // Invalid 1248 return false; 1249 1250 MachineInstr *ExtI = 1251 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1252 .addImm(0) // src0_modifiers 1253 .addImm(0) // src0 1254 .addImm(0) // src1_modifiers 1255 .addImm(Signed ? -1 : 1) // src1 1256 .addUse(SrcReg); 1257 I.eraseFromParent(); 1258 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1259 } 1260 1261 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1262 return selectCOPY(I); 1263 1264 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1265 // 64-bit should have been split up in RegBankSelect 1266 1267 // Try to use an and with a mask if it will save code size. 1268 unsigned Mask; 1269 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1270 MachineInstr *ExtI = 1271 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1272 .addImm(Mask) 1273 .addReg(SrcReg); 1274 I.eraseFromParent(); 1275 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1276 } 1277 1278 const unsigned BFE = Signed ? 
AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1279 MachineInstr *ExtI = 1280 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1281 .addReg(SrcReg) 1282 .addImm(0) // Offset 1283 .addImm(SrcSize); // Width 1284 I.eraseFromParent(); 1285 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1286 } 1287 1288 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1289 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) 1290 return false; 1291 1292 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1293 const unsigned SextOpc = SrcSize == 8 ? 1294 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1295 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1296 .addReg(SrcReg); 1297 I.eraseFromParent(); 1298 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1299 } 1300 1301 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1302 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1303 1304 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1305 if (DstSize > 32 && SrcSize <= 32) { 1306 // We need a 64-bit register source, but the high bits don't matter. 1307 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1308 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1309 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1310 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1311 .addReg(SrcReg) 1312 .addImm(AMDGPU::sub0) 1313 .addReg(UndefReg) 1314 .addImm(AMDGPU::sub1); 1315 1316 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1317 .addReg(ExtReg) 1318 .addImm(SrcSize << 16); 1319 1320 I.eraseFromParent(); 1321 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1322 } 1323 1324 unsigned Mask; 1325 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1326 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1327 .addReg(SrcReg) 1328 .addImm(Mask); 1329 } else { 1330 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1331 .addReg(SrcReg) 1332 .addImm(SrcSize << 16); 1333 } 1334 1335 I.eraseFromParent(); 1336 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1337 } 1338 1339 return false; 1340 } 1341 1342 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1343 MachineBasicBlock *BB = I.getParent(); 1344 MachineOperand &ImmOp = I.getOperand(1); 1345 1346 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1347 if (ImmOp.isFPImm()) { 1348 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1349 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1350 } else if (ImmOp.isCImm()) { 1351 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1352 } 1353 1354 Register DstReg = I.getOperand(0).getReg(); 1355 unsigned Size; 1356 bool IsSgpr; 1357 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1358 if (RB) { 1359 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1360 Size = MRI->getType(DstReg).getSizeInBits(); 1361 } else { 1362 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1363 IsSgpr = TRI.isSGPRClass(RC); 1364 Size = TRI.getRegSizeInBits(*RC); 1365 } 1366 1367 if (Size != 32 && Size != 64) 1368 return false; 1369 1370 unsigned Opcode = IsSgpr ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1371 if (Size == 32) { 1372 I.setDesc(TII.get(Opcode)); 1373 I.addImplicitDefUseOperands(*MF); 1374 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1375 } 1376 1377 const DebugLoc &DL = I.getDebugLoc(); 1378 1379 APInt Imm(Size, I.getOperand(1).getImm()); 1380 1381 MachineInstr *ResInst; 1382 if (IsSgpr && TII.isInlineConstant(Imm)) { 1383 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1384 .addImm(I.getOperand(1).getImm()); 1385 } else { 1386 const TargetRegisterClass *RC = IsSgpr ? 1387 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 1388 Register LoReg = MRI->createVirtualRegister(RC); 1389 Register HiReg = MRI->createVirtualRegister(RC); 1390 1391 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 1392 .addImm(Imm.trunc(32).getZExtValue()); 1393 1394 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 1395 .addImm(Imm.ashr(32).getZExtValue()); 1396 1397 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1398 .addReg(LoReg) 1399 .addImm(AMDGPU::sub0) 1400 .addReg(HiReg) 1401 .addImm(AMDGPU::sub1); 1402 } 1403 1404 // We can't call constrainSelectedInstRegOperands here, because it doesn't 1405 // work for target independent opcodes 1406 I.eraseFromParent(); 1407 const TargetRegisterClass *DstRC = 1408 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 1409 if (!DstRC) 1410 return true; 1411 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1412 } 1413 1414 static bool isConstant(const MachineInstr &MI) { 1415 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 1416 } 1417 1418 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 1419 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 1420 1421 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); 1422 1423 assert(PtrMI); 1424 1425 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 1426 return; 1427 1428 GEPInfo GEPInfo(*PtrMI); 1429 1430 for (unsigned i = 1; i != 3; ++i) { 1431 const MachineOperand &GEPOp = PtrMI->getOperand(i); 1432 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 1433 assert(OpDef); 1434 if (i == 2 && isConstant(*OpDef)) { 1435 // TODO: Could handle constant base + variable offset, but a combine 1436 // probably should have commuted it. 1437 assert(GEPInfo.Imm == 0); 1438 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 1439 continue; 1440 } 1441 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 1442 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 1443 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 1444 else 1445 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 1446 } 1447 1448 AddrInfo.push_back(GEPInfo); 1449 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 1450 } 1451 1452 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 1453 if (!MI.hasOneMemOperand()) 1454 return false; 1455 1456 const MachineMemOperand *MMO = *MI.memoperands_begin(); 1457 const Value *Ptr = MMO->getValue(); 1458 1459 // UndefValue means this is a load of a kernel input. These are uniform. 1460 // Sometimes LDS instructions have constant pointers. 1461 // If Ptr is null, then that means this mem operand contains a 1462 // PseudoSourceValue like GOT. 
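// All of these cases are treated as uniform.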
1463 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || 1464 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) 1465 return true; 1466 1467 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 1468 return true; 1469 1470 const Instruction *I = dyn_cast<Instruction>(Ptr); 1471 return I && I->getMetadata("amdgpu.uniform"); 1472 } 1473 1474 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { 1475 for (const GEPInfo &GEPInfo : AddrInfo) { 1476 if (!GEPInfo.VgprParts.empty()) 1477 return true; 1478 } 1479 return false; 1480 } 1481 1482 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { 1483 MachineBasicBlock *BB = I.getParent(); 1484 1485 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); 1486 unsigned AS = PtrTy.getAddressSpace(); 1487 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && 1488 STI.ldsRequiresM0Init()) { 1489 // If DS instructions require M0 initialization, insert it before selecting. 1490 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1491 .addImm(-1); 1492 } 1493 } 1494 1495 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { 1496 initM0(I); 1497 return selectImpl(I, *CoverageInfo); 1498 } 1499 1500 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 1501 MachineBasicBlock *BB = I.getParent(); 1502 MachineOperand &CondOp = I.getOperand(0); 1503 Register CondReg = CondOp.getReg(); 1504 const DebugLoc &DL = I.getDebugLoc(); 1505 1506 unsigned BrOpcode; 1507 Register CondPhysReg; 1508 const TargetRegisterClass *ConstrainRC; 1509 1510 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 1511 // whether the branch is uniform when selecting the instruction. In 1512 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 1513 // RegBankSelect knows what it's doing if the branch condition is scc, even 1514 // though it currently does not. 1515 if (isSCC(CondReg, *MRI)) { 1516 CondPhysReg = AMDGPU::SCC; 1517 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 1518 // FIXME: Hack for isSCC tests 1519 ConstrainRC = &AMDGPU::SGPR_32RegClass; 1520 } else if (isVCC(CondReg, *MRI)) { 1521 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 1522 // Based on the register bank, we sort of know that a VCC producer ands 1523 // inactive lanes with 0. What if there was a logical operation with vcc 1524 // producers in different blocks/with different exec masks? 1525 // FIXME: Should scc->vcc copies and with exec? 1526 CondPhysReg = TRI.getVCC(); 1527 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 1528 ConstrainRC = TRI.getBoolRC(); 1529 } else 1530 return false; 1531 1532 if (!MRI->getRegClassOrNull(CondReg)) 1533 MRI->setRegClass(CondReg, ConstrainRC); 1534 1535 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 1536 .addReg(CondReg); 1537 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 1538 .addMBB(I.getOperand(1).getMBB()); 1539 1540 I.eraseFromParent(); 1541 return true; 1542 } 1543 1544 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const { 1545 Register DstReg = I.getOperand(0).getReg(); 1546 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1547 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1548 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 1549 if (IsVGPR) 1550 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 1551 1552 return RBI.constrainGenericRegister( 1553 DstReg, IsVGPR ?
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 1554 } 1555 1556 bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { 1557 uint64_t Align = I.getOperand(2).getImm(); 1558 const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); 1559 1560 MachineBasicBlock *BB = I.getParent(); 1561 1562 Register DstReg = I.getOperand(0).getReg(); 1563 Register SrcReg = I.getOperand(1).getReg(); 1564 1565 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1566 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1567 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1568 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1569 unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1570 const TargetRegisterClass &RegRC 1571 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 1572 1573 LLT Ty = MRI->getType(DstReg); 1574 1575 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 1576 *MRI); 1577 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 1578 *MRI); 1579 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 1580 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 1581 return false; 1582 1583 const DebugLoc &DL = I.getDebugLoc(); 1584 Register ImmReg = MRI->createVirtualRegister(&RegRC); 1585 BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg) 1586 .addImm(Mask); 1587 1588 if (Ty.getSizeInBits() == 32) { 1589 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 1590 .addReg(SrcReg) 1591 .addReg(ImmReg); 1592 I.eraseFromParent(); 1593 return true; 1594 } 1595 1596 Register HiReg = MRI->createVirtualRegister(&RegRC); 1597 Register LoReg = MRI->createVirtualRegister(&RegRC); 1598 Register MaskLo = MRI->createVirtualRegister(&RegRC); 1599 1600 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 1601 .addReg(SrcReg, 0, AMDGPU::sub0); 1602 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 1603 .addReg(SrcReg, 0, AMDGPU::sub1); 1604 1605 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo) 1606 .addReg(LoReg) 1607 .addReg(ImmReg); 1608 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1609 .addReg(MaskLo) 1610 .addImm(AMDGPU::sub0) 1611 .addReg(HiReg) 1612 .addImm(AMDGPU::sub1); 1613 I.eraseFromParent(); 1614 return true; 1615 } 1616 1617 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 1618 if (I.isPHI()) 1619 return selectPHI(I); 1620 1621 if (!I.isPreISelOpcode()) { 1622 if (I.isCopy()) 1623 return selectCOPY(I); 1624 return true; 1625 } 1626 1627 switch (I.getOpcode()) { 1628 case TargetOpcode::G_AND: 1629 case TargetOpcode::G_OR: 1630 case TargetOpcode::G_XOR: 1631 if (selectG_AND_OR_XOR(I)) 1632 return true; 1633 return selectImpl(I, *CoverageInfo); 1634 case TargetOpcode::G_ADD: 1635 case TargetOpcode::G_SUB: 1636 if (selectImpl(I, *CoverageInfo)) 1637 return true; 1638 return selectG_ADD_SUB(I); 1639 case TargetOpcode::G_UADDO: 1640 case TargetOpcode::G_USUBO: 1641 return selectG_UADDO_USUBO(I); 1642 case TargetOpcode::G_INTTOPTR: 1643 case TargetOpcode::G_BITCAST: 1644 case TargetOpcode::G_PTRTOINT: 1645 return selectCOPY(I); 1646 case TargetOpcode::G_CONSTANT: 1647 case TargetOpcode::G_FCONSTANT: 1648 return selectG_CONSTANT(I); 1649 case TargetOpcode::G_EXTRACT: 1650 return selectG_EXTRACT(I); 1651 case TargetOpcode::G_MERGE_VALUES: 1652 case TargetOpcode::G_BUILD_VECTOR: 1653 case TargetOpcode::G_CONCAT_VECTORS: 1654 return selectG_MERGE_VALUES(I); 1655 case TargetOpcode::G_UNMERGE_VALUES: 1656 return selectG_UNMERGE_VALUES(I); 1657 case 
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
  if (I.isPHI())
    return selectPHI(I);

  if (!I.isPreISelOpcode()) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    if (selectG_AND_OR_XOR(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_ADD_SUB(I);
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
    return selectG_UADDO_USUBO(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_PTRTOINT:
    return selectCOPY(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
    return selectG_CONSTANT(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_PTR_ADD:
    return selectG_PTR_ADD(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
    if (selectG_ICMP(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_FADD:
    return selectG_LOAD_ATOMICRMW(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_STORE:
    return selectG_STORE(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return selectG_SZA_EXT(I);
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_FRAME_INDEX:
    return selectG_FRAME_INDEX(I);
  case TargetOpcode::G_PTR_MASK:
    return selectG_PTR_MASK(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
  return false;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}
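
// Fold G_FNEG/G_FABS on the source into VOP3 source modifiers. Illustrative
// sketch (not from a real test): for
//   %a = G_FABS %x
//   %b = G_FNEG %a
// a VOP3 use of %b selects with src = %x and
// src_mods = SISrcMods::NEG | SISrcMods::ABS. Note the match below is
// fneg-of-fabs; a bare G_FABS only sets ABS.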
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(
  Register Src) const {
  unsigned Mods = 0;
  MachineInstr *MI = MRI->getVRegDef(Src);

  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::NEG;
    MI = MRI->getVRegDef(Src);
  }

  if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::ABS;
  }

  return std::make_pair(Src, Mods);
}

/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
  // FIXME: Handle clamp and op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // clamp
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
  // FIXME: Handle op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
  }};
}
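
// Match an SMRD (scalar load) address: a single SGPR base from
// getAddrModeInfo() plus an immediate offset that isLegalSMRDImmOffset()
// accepts. Illustrative sketch (not from a real test):
//   %c = G_CONSTANT i64 16
//   %addr = G_PTR_ADD %sgpr_base, %c
// selects as base = %sgpr_base with the 16-byte offset converted by
// AMDGPU::getSMRDEncodedOffset(); the encoding is subtarget-dependent, which
// is why the raw byte offset is not emitted directly.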
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];

  if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
    return None;

  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  if (!isUInt<32>(EncodedImm))
    return None;

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM Patterns are considered before the _SGPR patterns.
  unsigned PtrReg = GEPInfo.SgprParts[0];
  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}

template <bool Signed>
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();

  InstructionSelector::ComplexRendererFns Default = {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
    }};

  if (!STI.hasFlatInstOffsets())
    return Default;

  const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
  if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
    return Default;

  Optional<int64_t> Offset =
    getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
  if (!Offset.hasValue())
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
    return Default;

  Register BasePtr = OpDef->getOperand(1).getReg();

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  return selectFlatOffsetImpl<false>(Root);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
  return selectFlatOffsetImpl<true>(Root);
}

static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}
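
// Select the rsrc/vaddr/soffset/offset operands for an "offen" MUBUF access
// to scratch. Worked example for the constant-address path below
// (illustrative only): for a constant address of 0x1234 the selector emits
//   %highbits:vgpr_32 = V_MOV_B32_e32 0x1000   // Offset & ~4095 -> vaddr
// and renders 0x234 (Offset & 4095) into the immediate offset field, using
// the scratch rsrc and either the stack pointer or the scratch wave offset
// register as soffset.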
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
      .addImm(Offset & ~4095);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               const MachineMemOperand *MMO = *MI->memoperands_begin();
               const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

               Register SOffsetReg = isStackPtrRelative(PtrInfo)
                                         ? Info->getStackPtrOffsetReg()
                                         : Info->getScratchWaveOffsetReg();
               MIB.addReg(SOffsetReg);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    if (isBaseWithConstantOffset(Root, *MRI)) {
      const MachineOperand &LHS = RootDef->getOperand(1);
      const MachineOperand &RHS = RootDef->getOperand(2);
      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
      if (LHSDef && RHSDef) {
        int64_t PossibleOffset =
          RHSDef->getOperand(1).getCImm()->getSExtValue();
        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
            (!STI.privateMemoryResourceIsRangeChecked() ||
             KnownBits->signBitIsZero(LHS.getReg()))) {
          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
            FI = LHSDef->getOperand(1).getIndex();
          else
            VAddr = LHS.getReg();
          Offset = PossibleOffset;
        }
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  // TODO: Should split large offsets that don't fit like above.
  // TODO: Don't use scratch wave offset just because the offset didn't fit.
  Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
                                   : Info->getScratchWaveOffsetReg();

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             MIB.addReg(SOffset);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}
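
// DS offsets are unsigned and limited to the width the caller passes
// (16 bits for the single-offset forms used below). On subtargets where a
// negative base does not combine correctly with an offset, the base must
// additionally be provably non-negative (sign bit known zero).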
bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
                                                const MachineOperand &Base,
                                                int64_t Offset,
                                                unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return KnownBits->signBitIsZero(Base.getReg());
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  Register SOffsetReg = isStackPtrRelative(PtrInfo)
                            ? Info->getStackPtrOffsetReg()
                            : Info->getScratchWaveOffsetReg();
  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(Info->getScratchRSrcReg());
      },                                                         // rsrc
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }      // offset
  }};
}
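
// Match a DS address as (base, 16-bit unsigned offset). Illustrative sketch
// (not from a real test):
//   %c = G_CONSTANT i32 64
//   %addr = G_PTR_ADD %base, %c
// selects as addr = %base, offset = 64 when isDSOffsetLegal() accepts the
// constant; otherwise the full address is used with an offset of 0.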
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef) {
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
      }};
  }

  int64_t ConstAddr = 0;
  if (isBaseWithConstantOffset(Root, *MRI)) {
    const MachineOperand &LHS = RootDef->getOperand(1);
    const MachineOperand &RHS = RootDef->getOperand(2);
    const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
    const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
    if (LHSDef && RHSDef) {
      int64_t PossibleOffset =
        RHSDef->getOperand(1).getCImm()->getSExtValue();
      if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) {
        // (add n0, c0)
        return {{
            [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); }
          }};
      }
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO: Not handled yet; fall through to the default case below.
  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO: Constant addresses are not handled yet either.
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
    }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
  Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), *MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(CstVal.getValue());
}