//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
                                        CodeGenCoverage &CoverageInfo) {
  MRI = &MF.getRegInfo();
  InstructionSelector::setupMF(MF, KB, CoverageInfo);
}

static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
  if (Register::isPhysicalRegister(Reg))
    return Reg == AMDGPU::SCC;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the
    // context of the register bank has been lost.
    // As a hack, getRegClassForSizeOnBank uses exactly SGPR_32RegClass, which
    // won't ever be constrained any further.
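    // In other words, once only a register class is available, an s1 value in
    // exactly SGPR_32RegClass is treated as SCC here; any other class is not.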
    if (RC != &AMDGPU::SGPR_32RegClass)
      return false;
    const LLT Ty = MRI.getType(Reg);
    return Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::SCCRegBankID;
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  if (Register::isPhysicalRegister(Reg))
    return Reg == TRI.getVCC();

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(SrcReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, *MRI));
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to specifically
    // know that the result is VCCRegBank, and not just an SGPR with size 1.
    // An SReg_32 with size 1 is ambiguous with wave32.
    if (Src.isUndef()) {
      const TargetRegisterClass *SrcRC =
        TRI.getConstrainedRegClassForOperand(Src, *MRI);
      if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (Register::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // TODO: Verify this doesn't have insane operands (i.e.
VGPR to SGPR copy) 185 186 const RegClassOrRegBank &RegClassOrBank = 187 MRI->getRegClassOrRegBank(DefReg); 188 189 const TargetRegisterClass *DefRC 190 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 191 if (!DefRC) { 192 if (!DefTy.isValid()) { 193 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 194 return false; 195 } 196 197 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 198 if (RB.getID() == AMDGPU::SCCRegBankID) { 199 LLVM_DEBUG(dbgs() << "illegal scc phi\n"); 200 return false; 201 } 202 203 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); 204 if (!DefRC) { 205 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 206 return false; 207 } 208 } 209 210 I.setDesc(TII.get(TargetOpcode::PHI)); 211 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); 212 } 213 214 MachineOperand 215 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, 216 const TargetRegisterClass &SubRC, 217 unsigned SubIdx) const { 218 219 MachineInstr *MI = MO.getParent(); 220 MachineBasicBlock *BB = MO.getParent()->getParent(); 221 Register DstReg = MRI->createVirtualRegister(&SubRC); 222 223 if (MO.isReg()) { 224 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); 225 Register Reg = MO.getReg(); 226 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 227 .addReg(Reg, 0, ComposedSubIdx); 228 229 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), 230 MO.isKill(), MO.isDead(), MO.isUndef(), 231 MO.isEarlyClobber(), 0, MO.isDebug(), 232 MO.isInternalRead()); 233 } 234 235 assert(MO.isImm()); 236 237 APInt Imm(64, MO.getImm()); 238 239 switch (SubIdx) { 240 default: 241 llvm_unreachable("do not know to split immediate with this sub index."); 242 case AMDGPU::sub0: 243 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); 244 case AMDGPU::sub1: 245 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); 246 } 247 } 248 249 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { 250 switch (Opc) { 251 case AMDGPU::G_AND: 252 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 253 case AMDGPU::G_OR: 254 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; 255 case AMDGPU::G_XOR: 256 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 257 default: 258 llvm_unreachable("not a bit op"); 259 } 260 } 261 262 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 263 MachineOperand &Dst = I.getOperand(0); 264 MachineOperand &Src0 = I.getOperand(1); 265 MachineOperand &Src1 = I.getOperand(2); 266 Register DstReg = Dst.getReg(); 267 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 268 269 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 270 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 271 const TargetRegisterClass *RC = TRI.getBoolRC(); 272 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), 273 RC == &AMDGPU::SReg_64RegClass); 274 I.setDesc(TII.get(InstOpc)); 275 276 // FIXME: Hack to avoid turning the register bank into a register class. 277 // The selector for G_ICMP relies on seeing the register bank for the result 278 // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will 279 // be ambiguous whether it's a scalar or vector bool. 
280 if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) 281 MRI->setRegClass(Src0.getReg(), RC); 282 if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) 283 MRI->setRegClass(Src1.getReg(), RC); 284 285 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 286 } 287 288 // TODO: Should this allow an SCC bank result, and produce a copy from SCC for 289 // the result? 290 if (DstRB->getID() == AMDGPU::SGPRRegBankID) { 291 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); 292 I.setDesc(TII.get(InstOpc)); 293 // Dead implicit-def of scc 294 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 295 true, // isImp 296 false, // isKill 297 true)); // isDead 298 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 299 } 300 301 return false; 302 } 303 304 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 305 MachineBasicBlock *BB = I.getParent(); 306 MachineFunction *MF = BB->getParent(); 307 Register DstReg = I.getOperand(0).getReg(); 308 const DebugLoc &DL = I.getDebugLoc(); 309 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 310 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 311 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 312 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 313 314 if (Size == 32) { 315 if (IsSALU) { 316 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 317 MachineInstr *Add = 318 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 319 .add(I.getOperand(1)) 320 .add(I.getOperand(2)); 321 I.eraseFromParent(); 322 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 323 } 324 325 if (STI.hasAddNoCarry()) { 326 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 327 I.setDesc(TII.get(Opc)); 328 I.addOperand(*MF, MachineOperand::CreateImm(0)); 329 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 330 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 331 } 332 333 const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; 334 335 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); 336 MachineInstr *Add 337 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 338 .addDef(UnusedCarry, RegState::Dead) 339 .add(I.getOperand(1)) 340 .add(I.getOperand(2)) 341 .addImm(0); 342 I.eraseFromParent(); 343 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 344 } 345 346 assert(!Sub && "illegal sub should not reach here"); 347 348 const TargetRegisterClass &RC 349 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; 350 const TargetRegisterClass &HalfRC 351 = IsSALU ? 
      AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO;

  if (!isSCC(Dst1Reg, MRI)) {
    // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have an
    // unsigned carry out despite the _i32 name. These were renamed in VI to
    // _U32.
    // FIXME: We should probably rename the opcodes here.
    unsigned NewOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
    I.setDesc(TII.get(NewOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();
  unsigned NewOpc = IsAdd ?
AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 424 BuildMI(*BB, &I, DL, TII.get(NewOpc), Dst0Reg) 425 .add(I.getOperand(2)) 426 .add(I.getOperand(3)); 427 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 428 .addReg(AMDGPU::SCC); 429 430 if (!MRI.getRegClassOrNull(Dst1Reg)) 431 MRI.setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 432 433 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, MRI) || 434 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, MRI) || 435 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, MRI)) 436 return false; 437 438 I.eraseFromParent(); 439 return true; 440 } 441 442 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 443 MachineBasicBlock *BB = I.getParent(); 444 unsigned Offset = I.getOperand(2).getImm(); 445 if (Offset % 32 != 0) 446 return false; 447 448 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32); 449 const DebugLoc &DL = I.getDebugLoc(); 450 MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), 451 I.getOperand(0).getReg()) 452 .addReg(I.getOperand(1).getReg(), 0, SubReg); 453 454 for (const MachineOperand &MO : Copy->operands()) { 455 const TargetRegisterClass *RC = 456 TRI.getConstrainedRegClassForOperand(MO, *MRI); 457 if (!RC) 458 continue; 459 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 460 } 461 I.eraseFromParent(); 462 return true; 463 } 464 465 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 466 MachineBasicBlock *BB = MI.getParent(); 467 Register DstReg = MI.getOperand(0).getReg(); 468 LLT DstTy = MRI->getType(DstReg); 469 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 470 471 const unsigned SrcSize = SrcTy.getSizeInBits(); 472 if (SrcSize < 32) 473 return false; 474 475 const DebugLoc &DL = MI.getDebugLoc(); 476 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 477 const unsigned DstSize = DstTy.getSizeInBits(); 478 const TargetRegisterClass *DstRC = 479 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 480 if (!DstRC) 481 return false; 482 483 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 484 MachineInstrBuilder MIB = 485 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 486 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 487 MachineOperand &Src = MI.getOperand(I + 1); 488 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 489 MIB.addImm(SubRegs[I]); 490 491 const TargetRegisterClass *SrcRC 492 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 493 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 494 return false; 495 } 496 497 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 498 return false; 499 500 MI.eraseFromParent(); 501 return true; 502 } 503 504 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 505 MachineBasicBlock *BB = MI.getParent(); 506 const int NumDst = MI.getNumOperands() - 1; 507 508 MachineOperand &Src = MI.getOperand(NumDst); 509 510 Register SrcReg = Src.getReg(); 511 Register DstReg0 = MI.getOperand(0).getReg(); 512 LLT DstTy = MRI->getType(DstReg0); 513 LLT SrcTy = MRI->getType(SrcReg); 514 515 const unsigned DstSize = DstTy.getSizeInBits(); 516 const unsigned SrcSize = SrcTy.getSizeInBits(); 517 const DebugLoc &DL = MI.getDebugLoc(); 518 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 519 520 const TargetRegisterClass *SrcRC = 521 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 522 if (!SrcRC || 
!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 523 return false; 524 525 const unsigned SrcFlags = getUndefRegState(Src.isUndef()); 526 527 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 528 // source, and this relies on the fact that the same subregister indices are 529 // used for both. 530 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 531 for (int I = 0, E = NumDst; I != E; ++I) { 532 MachineOperand &Dst = MI.getOperand(I); 533 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 534 .addReg(SrcReg, SrcFlags, SubRegs[I]); 535 536 const TargetRegisterClass *DstRC = 537 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 538 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 539 return false; 540 } 541 542 MI.eraseFromParent(); 543 return true; 544 } 545 546 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 547 return selectG_ADD_SUB(I); 548 } 549 550 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 551 const MachineOperand &MO = I.getOperand(0); 552 553 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 554 // regbank check here is to know why getConstrainedRegClassForOperand failed. 555 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 556 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 557 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 558 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 559 return true; 560 } 561 562 return false; 563 } 564 565 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 566 MachineBasicBlock *BB = I.getParent(); 567 568 Register DstReg = I.getOperand(0).getReg(); 569 Register Src0Reg = I.getOperand(1).getReg(); 570 Register Src1Reg = I.getOperand(2).getReg(); 571 LLT Src1Ty = MRI->getType(Src1Reg); 572 573 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 574 unsigned InsSize = Src1Ty.getSizeInBits(); 575 576 int64_t Offset = I.getOperand(3).getImm(); 577 if (Offset % 32 != 0) 578 return false; 579 580 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 581 if (SubReg == AMDGPU::NoSubRegister) 582 return false; 583 584 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 585 const TargetRegisterClass *DstRC = 586 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 587 if (!DstRC) 588 return false; 589 590 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 591 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 592 const TargetRegisterClass *Src0RC = 593 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 594 const TargetRegisterClass *Src1RC = 595 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 596 597 // Deal with weird cases where the class only partially supports the subreg 598 // index. 
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ?
AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64; 673 } 674 } 675 676 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, 677 unsigned Size) const { 678 if (Size == 64) { 679 if (!STI.hasScalarCompareEq64()) 680 return -1; 681 682 switch (P) { 683 case CmpInst::ICMP_NE: 684 return AMDGPU::S_CMP_LG_U64; 685 case CmpInst::ICMP_EQ: 686 return AMDGPU::S_CMP_EQ_U64; 687 default: 688 return -1; 689 } 690 } 691 692 if (Size != 32) 693 return -1; 694 695 switch (P) { 696 case CmpInst::ICMP_NE: 697 return AMDGPU::S_CMP_LG_U32; 698 case CmpInst::ICMP_EQ: 699 return AMDGPU::S_CMP_EQ_U32; 700 case CmpInst::ICMP_SGT: 701 return AMDGPU::S_CMP_GT_I32; 702 case CmpInst::ICMP_SGE: 703 return AMDGPU::S_CMP_GE_I32; 704 case CmpInst::ICMP_SLT: 705 return AMDGPU::S_CMP_LT_I32; 706 case CmpInst::ICMP_SLE: 707 return AMDGPU::S_CMP_LE_I32; 708 case CmpInst::ICMP_UGT: 709 return AMDGPU::S_CMP_GT_U32; 710 case CmpInst::ICMP_UGE: 711 return AMDGPU::S_CMP_GE_U32; 712 case CmpInst::ICMP_ULT: 713 return AMDGPU::S_CMP_LT_U32; 714 case CmpInst::ICMP_ULE: 715 return AMDGPU::S_CMP_LE_U32; 716 default: 717 llvm_unreachable("Unknown condition code!"); 718 } 719 } 720 721 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { 722 MachineBasicBlock *BB = I.getParent(); 723 const DebugLoc &DL = I.getDebugLoc(); 724 725 Register SrcReg = I.getOperand(2).getReg(); 726 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 727 728 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 729 730 Register CCReg = I.getOperand(0).getReg(); 731 if (isSCC(CCReg, *MRI)) { 732 int Opcode = getS_CMPOpcode(Pred, Size); 733 if (Opcode == -1) 734 return false; 735 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) 736 .add(I.getOperand(2)) 737 .add(I.getOperand(3)); 738 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) 739 .addReg(AMDGPU::SCC); 740 bool Ret = 741 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && 742 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); 743 I.eraseFromParent(); 744 return Ret; 745 } 746 747 int Opcode = getV_CMPOpcode(Pred, Size); 748 if (Opcode == -1) 749 return false; 750 751 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), 752 I.getOperand(0).getReg()) 753 .add(I.getOperand(2)) 754 .add(I.getOperand(3)); 755 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), 756 *TRI.getBoolRC(), *MRI); 757 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); 758 I.eraseFromParent(); 759 return Ret; 760 } 761 762 static MachineInstr * 763 buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt, 764 unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3, 765 unsigned VM, bool Compr, unsigned Enabled, bool Done) { 766 const DebugLoc &DL = Insert->getDebugLoc(); 767 MachineBasicBlock &BB = *Insert->getParent(); 768 unsigned Opcode = Done ? 
AMDGPU::EXP_DONE : AMDGPU::EXP; 769 return BuildMI(BB, Insert, DL, TII.get(Opcode)) 770 .addImm(Tgt) 771 .addReg(Reg0) 772 .addReg(Reg1) 773 .addReg(Reg2) 774 .addReg(Reg3) 775 .addImm(VM) 776 .addImm(Compr) 777 .addImm(Enabled); 778 } 779 780 static bool isZero(Register Reg, MachineRegisterInfo &MRI) { 781 int64_t C; 782 if (mi_match(Reg, MRI, m_ICst(C)) && C == 0) 783 return true; 784 785 // FIXME: matcher should ignore copies 786 return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; 787 } 788 789 static unsigned extractGLC(unsigned AuxiliaryData) { 790 return AuxiliaryData & 1; 791 } 792 793 static unsigned extractSLC(unsigned AuxiliaryData) { 794 return (AuxiliaryData >> 1) & 1; 795 } 796 797 static unsigned extractDLC(unsigned AuxiliaryData) { 798 return (AuxiliaryData >> 2) & 1; 799 } 800 801 static unsigned extractSWZ(unsigned AuxiliaryData) { 802 return (AuxiliaryData >> 3) & 1; 803 } 804 805 // Returns Base register, constant offset, and offset def point. 806 static std::tuple<Register, unsigned, MachineInstr *> 807 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 808 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); 809 if (!Def) 810 return std::make_tuple(Reg, 0, nullptr); 811 812 if (Def->getOpcode() == AMDGPU::G_CONSTANT) { 813 unsigned Offset; 814 const MachineOperand &Op = Def->getOperand(1); 815 if (Op.isImm()) 816 Offset = Op.getImm(); 817 else 818 Offset = Op.getCImm()->getZExtValue(); 819 820 return std::make_tuple(Register(), Offset, Def); 821 } 822 823 int64_t Offset; 824 if (Def->getOpcode() == AMDGPU::G_ADD) { 825 // TODO: Handle G_OR used for add case 826 if (mi_match(Def->getOperand(1).getReg(), MRI, m_ICst(Offset))) 827 return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def); 828 829 // FIXME: matcher should ignore copies 830 if (mi_match(Def->getOperand(1).getReg(), MRI, m_Copy(m_ICst(Offset)))) 831 return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def); 832 } 833 834 return std::make_tuple(Reg, 0, Def); 835 } 836 837 static unsigned getBufferStoreOpcode(LLT Ty, 838 const unsigned MemSize, 839 const bool Offen) { 840 const int Size = Ty.getSizeInBits(); 841 switch (8 * MemSize) { 842 case 8: 843 return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : 844 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; 845 case 16: 846 return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : 847 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; 848 default: 849 unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : 850 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; 851 if (Size > 32) 852 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); 853 return Opc; 854 } 855 } 856 857 static unsigned getBufferStoreFormatOpcode(LLT Ty, 858 const unsigned MemSize, 859 const bool Offen) { 860 bool IsD16Packed = Ty.getScalarSizeInBits() == 16; 861 bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits(); 862 int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 863 864 if (IsD16Packed) { 865 switch (NumElts) { 866 case 1: 867 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : 868 AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; 869 case 2: 870 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact : 871 AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact; 872 case 3: 873 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact : 874 AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact; 875 case 4: 876 return Offen ? 
                   AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact :
                   AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact;
    default:
      return -1;
    }
  }

  if (IsD16Unpacked) {
    switch (NumElts) {
    case 1:
      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
                     AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
    case 2:
      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact :
                     AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact;
    case 3:
      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact :
                     AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact;
    case 4:
      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact :
                     AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact;
    default:
      return -1;
    }
  }

  switch (NumElts) {
  case 1:
    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact :
                   AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact;
  case 2:
    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact :
                   AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact;
  case 3:
    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact :
                   AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact;
  case 4:
    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact :
                   AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact;
  default:
    return -1;
  }

  llvm_unreachable("unhandled buffer store");
}

// TODO: Move this to combiner
// Returns base register, imm offset, total constant offset.
std::tuple<Register, unsigned, unsigned>
AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B,
                                              Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  MachineInstr *OffsetDef;

  std::tie(BaseReg, TotalConstOffset, OffsetDef)
    = getBaseWithConstantOffset(*MRI, OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    // In case this is in a waterfall loop, insert offset code at the def point
    // of the offset, not inside the loop.
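    // Worked example (illustrative, derived from MaxImm above): a total
    // constant offset of 5000 splits into ImmOffset = 904 and Overflow = 4096;
    // the 4096 is materialized in a VGPR below and either becomes the base or
    // is added to it with a no-carry add.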
954 MachineBasicBlock::iterator OldInsPt = B.getInsertPt(); 955 MachineBasicBlock &OldMBB = B.getMBB(); 956 B.setInstr(*OffsetDef); 957 958 if (!BaseReg) { 959 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 960 B.buildInstr(AMDGPU::V_MOV_B32_e32) 961 .addDef(BaseReg) 962 .addImm(Overflow); 963 } else { 964 Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 965 B.buildInstr(AMDGPU::V_MOV_B32_e32) 966 .addDef(OverflowVal) 967 .addImm(Overflow); 968 969 Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 970 TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg) 971 .addReg(BaseReg) 972 .addReg(OverflowVal, RegState::Kill) 973 .addImm(0); 974 BaseReg = NewBaseReg; 975 } 976 977 B.setInsertPt(OldMBB, OldInsPt); 978 } 979 980 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 981 } 982 983 bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, 984 bool IsFormat) const { 985 MachineIRBuilder B(MI); 986 MachineFunction &MF = B.getMF(); 987 Register VData = MI.getOperand(1).getReg(); 988 LLT Ty = MRI->getType(VData); 989 990 int Size = Ty.getSizeInBits(); 991 if (Size % 32 != 0) 992 return false; 993 994 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 995 MachineMemOperand *MMO = *MI.memoperands_begin(); 996 const int MemSize = MMO->getSize(); 997 998 Register RSrc = MI.getOperand(2).getReg(); 999 Register VOffset = MI.getOperand(3).getReg(); 1000 Register SOffset = MI.getOperand(4).getReg(); 1001 unsigned AuxiliaryData = MI.getOperand(5).getImm(); 1002 unsigned ImmOffset; 1003 unsigned TotalOffset; 1004 1005 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 1006 if (TotalOffset != 0) 1007 MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize); 1008 1009 const bool Offen = !isZero(VOffset, *MRI); 1010 1011 int Opc = IsFormat ? 
    getBufferStoreFormatOpcode(Ty, MemSize, Offen) :
    getBufferStoreOpcode(Ty, MemSize, Offen);
  if (Opc == -1)
    return false;

  MachineInstrBuilder MIB = B.buildInstr(Opc)
    .addUse(VData);

  if (Offen)
    MIB.addUse(VOffset);

  MIB.addUse(RSrc)
     .addUse(SOffset)
     .addImm(ImmOffset)
     .addImm(extractGLC(AuxiliaryData))
     .addImm(extractSLC(AuxiliaryData))
     .addImm(0) // tfe: FIXME: Remove from inst
     .addImm(extractDLC(AuxiliaryData))
     .addImm(extractSWZ(AuxiliaryData))
     .addMemOperand(MMO);

  MI.eraseFromParent();

  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp: {
    int64_t Tgt = I.getOperand(1).getImm();
    int64_t Enabled = I.getOperand(2).getImm();
    int64_t Done = I.getOperand(7).getImm();
    int64_t VM = I.getOperand(8).getImm();

    MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
                                 I.getOperand(4).getReg(),
                                 I.getOperand(5).getReg(),
                                 I.getOperand(6).getReg(),
                                 VM, false, Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  case Intrinsic::amdgcn_exp_compr: {
    const DebugLoc &DL = I.getDebugLoc();
    int64_t Tgt = I.getOperand(1).getImm();
    int64_t Enabled = I.getOperand(2).getImm();
    Register Reg0 = I.getOperand(3).getReg();
    Register Reg1 = I.getOperand(4).getReg();
    Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    int64_t Done = I.getOperand(5).getImm();
    int64_t VM = I.getOperand(6).getImm();

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
    MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
                                 true, Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  case Intrinsic::amdgcn_end_cf: {
    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(),
            TII.get(AMDGPU::SI_END_CF))
      .add(I.getOperand(1));

    Register Reg = I.getOperand(1).getReg();
    I.eraseFromParent();

    if (!MRI->getRegClassOrNull(Reg))
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return selectStoreIntrinsic(I, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return selectStoreIntrinsic(I, true);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (isSCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ?
AMDGPU::S_CSELECT_B64 : 1108 AMDGPU::S_CSELECT_B32; 1109 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1110 .addReg(CCReg); 1111 1112 // The generic constrainSelectedInstRegOperands doesn't work for the scc register 1113 // bank, because it does not cover the register class that we used to represent 1114 // for it. So we need to manually set the register class here. 1115 if (!MRI->getRegClassOrNull(CCReg)) 1116 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1117 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1118 .add(I.getOperand(2)) 1119 .add(I.getOperand(3)); 1120 1121 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1122 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1123 I.eraseFromParent(); 1124 return Ret; 1125 } 1126 1127 // Wide VGPR select should have been split in RegBankSelect. 1128 if (Size > 32) 1129 return false; 1130 1131 MachineInstr *Select = 1132 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1133 .addImm(0) 1134 .add(I.getOperand(3)) 1135 .addImm(0) 1136 .add(I.getOperand(2)) 1137 .add(I.getOperand(1)); 1138 1139 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1140 I.eraseFromParent(); 1141 return Ret; 1142 } 1143 1144 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1145 initM0(I); 1146 return selectImpl(I, *CoverageInfo); 1147 } 1148 1149 static int sizeToSubRegIndex(unsigned Size) { 1150 switch (Size) { 1151 case 32: 1152 return AMDGPU::sub0; 1153 case 64: 1154 return AMDGPU::sub0_sub1; 1155 case 96: 1156 return AMDGPU::sub0_sub1_sub2; 1157 case 128: 1158 return AMDGPU::sub0_sub1_sub2_sub3; 1159 case 256: 1160 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1161 default: 1162 if (Size < 32) 1163 return AMDGPU::sub0; 1164 if (Size > 256) 1165 return -1; 1166 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1167 } 1168 } 1169 1170 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1171 Register DstReg = I.getOperand(0).getReg(); 1172 Register SrcReg = I.getOperand(1).getReg(); 1173 const LLT DstTy = MRI->getType(DstReg); 1174 const LLT SrcTy = MRI->getType(SrcReg); 1175 if (!DstTy.isScalar()) 1176 return false; 1177 1178 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1179 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1180 if (SrcRB != DstRB) 1181 return false; 1182 1183 unsigned DstSize = DstTy.getSizeInBits(); 1184 unsigned SrcSize = SrcTy.getSizeInBits(); 1185 1186 const TargetRegisterClass *SrcRC 1187 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1188 const TargetRegisterClass *DstRC 1189 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1190 1191 if (SrcSize > 32) { 1192 int SubRegIdx = sizeToSubRegIndex(DstSize); 1193 if (SubRegIdx == -1) 1194 return false; 1195 1196 // Deal with weird cases where the class only partially supports the subreg 1197 // index. 1198 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1199 if (!SrcRC) 1200 return false; 1201 1202 I.getOperand(1).setSubReg(SubRegIdx); 1203 } 1204 1205 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1206 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1207 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1208 return false; 1209 } 1210 1211 I.setDesc(TII.get(TargetOpcode::COPY)); 1212 return true; 1213 } 1214 1215 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 
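/// For example, Size == 4 gives Mask == 0xf (15) and Size == 32 gives
/// Mask == 0xffffffff (-1), both of which fall in the inline range checked
/// below, while Size == 16 gives Mask == 0xffff (65535), which does not.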
1216 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1217 Mask = maskTrailingOnes<unsigned>(Size); 1218 int SignedMask = static_cast<int>(Mask); 1219 return SignedMask >= -16 && SignedMask <= 64; 1220 } 1221 1222 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1223 bool Signed = I.getOpcode() == AMDGPU::G_SEXT; 1224 const DebugLoc &DL = I.getDebugLoc(); 1225 MachineBasicBlock &MBB = *I.getParent(); 1226 const Register DstReg = I.getOperand(0).getReg(); 1227 const Register SrcReg = I.getOperand(1).getReg(); 1228 1229 const LLT DstTy = MRI->getType(DstReg); 1230 const LLT SrcTy = MRI->getType(SrcReg); 1231 const LLT S1 = LLT::scalar(1); 1232 const unsigned SrcSize = SrcTy.getSizeInBits(); 1233 const unsigned DstSize = DstTy.getSizeInBits(); 1234 if (!DstTy.isScalar()) 1235 return false; 1236 1237 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 1238 1239 if (SrcBank->getID() == AMDGPU::SCCRegBankID) { 1240 if (SrcTy != S1 || DstSize > 64) // Invalid 1241 return false; 1242 1243 unsigned Opcode = 1244 DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; 1245 const TargetRegisterClass *DstRC = 1246 DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass; 1247 1248 // FIXME: Create an extra copy to avoid incorrectly constraining the result 1249 // of the scc producer. 1250 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1251 BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg) 1252 .addReg(SrcReg); 1253 BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1254 .addReg(TmpReg); 1255 1256 // The instruction operands are backwards from what you would expect. 1257 BuildMI(MBB, I, DL, TII.get(Opcode), DstReg) 1258 .addImm(0) 1259 .addImm(Signed ? -1 : 1); 1260 I.eraseFromParent(); 1261 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1262 } 1263 1264 if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) { 1265 if (SrcTy != S1) // Invalid 1266 return false; 1267 1268 MachineInstr *ExtI = 1269 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1270 .addImm(0) // src0_modifiers 1271 .addImm(0) // src0 1272 .addImm(0) // src1_modifiers 1273 .addImm(Signed ? -1 : 1) // src1 1274 .addUse(SrcReg); 1275 I.eraseFromParent(); 1276 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1277 } 1278 1279 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1280 return selectCOPY(I); 1281 1282 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1283 // 64-bit should have been split up in RegBankSelect 1284 1285 // Try to use an and with a mask if it will save code size. 1286 unsigned Mask; 1287 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1288 MachineInstr *ExtI = 1289 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1290 .addImm(Mask) 1291 .addReg(SrcReg); 1292 I.eraseFromParent(); 1293 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1294 } 1295 1296 const unsigned BFE = Signed ? 
AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1297 MachineInstr *ExtI = 1298 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1299 .addReg(SrcReg) 1300 .addImm(0) // Offset 1301 .addImm(SrcSize); // Width 1302 I.eraseFromParent(); 1303 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1304 } 1305 1306 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1307 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) 1308 return false; 1309 1310 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1311 const unsigned SextOpc = SrcSize == 8 ? 1312 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1313 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1314 .addReg(SrcReg); 1315 I.eraseFromParent(); 1316 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1317 } 1318 1319 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1320 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1321 1322 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1323 if (DstSize > 32 && SrcSize <= 32) { 1324 // We need a 64-bit register source, but the high bits don't matter. 1325 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1326 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1327 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1328 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1329 .addReg(SrcReg) 1330 .addImm(AMDGPU::sub0) 1331 .addReg(UndefReg) 1332 .addImm(AMDGPU::sub1); 1333 1334 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1335 .addReg(ExtReg) 1336 .addImm(SrcSize << 16); 1337 1338 I.eraseFromParent(); 1339 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1340 } 1341 1342 unsigned Mask; 1343 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1344 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1345 .addReg(SrcReg) 1346 .addImm(Mask); 1347 } else { 1348 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1349 .addReg(SrcReg) 1350 .addImm(SrcSize << 16); 1351 } 1352 1353 I.eraseFromParent(); 1354 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1355 } 1356 1357 return false; 1358 } 1359 1360 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1361 MachineBasicBlock *BB = I.getParent(); 1362 MachineOperand &ImmOp = I.getOperand(1); 1363 1364 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1365 if (ImmOp.isFPImm()) { 1366 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1367 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1368 } else if (ImmOp.isCImm()) { 1369 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1370 } 1371 1372 Register DstReg = I.getOperand(0).getReg(); 1373 unsigned Size; 1374 bool IsSgpr; 1375 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1376 if (RB) { 1377 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1378 Size = MRI->getType(DstReg).getSizeInBits(); 1379 } else { 1380 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1381 IsSgpr = TRI.isSGPRClass(RC); 1382 Size = TRI.getRegSizeInBits(*RC); 1383 } 1384 1385 if (Size != 32 && Size != 64) 1386 return false; 1387 1388 unsigned Opcode = IsSgpr ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1389 if (Size == 32) { 1390 I.setDesc(TII.get(Opcode)); 1391 I.addImplicitDefUseOperands(*MF); 1392 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1393 } 1394 1395 const DebugLoc &DL = I.getDebugLoc(); 1396 1397 APInt Imm(Size, I.getOperand(1).getImm()); 1398 1399 MachineInstr *ResInst; 1400 if (IsSgpr && TII.isInlineConstant(Imm)) { 1401 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1402 .addImm(I.getOperand(1).getImm()); 1403 } else { 1404 const TargetRegisterClass *RC = IsSgpr ? 1405 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 1406 Register LoReg = MRI->createVirtualRegister(RC); 1407 Register HiReg = MRI->createVirtualRegister(RC); 1408 1409 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 1410 .addImm(Imm.trunc(32).getZExtValue()); 1411 1412 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 1413 .addImm(Imm.ashr(32).getZExtValue()); 1414 1415 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1416 .addReg(LoReg) 1417 .addImm(AMDGPU::sub0) 1418 .addReg(HiReg) 1419 .addImm(AMDGPU::sub1); 1420 } 1421 1422 // We can't call constrainSelectedInstRegOperands here, because it doesn't 1423 // work for target independent opcodes 1424 I.eraseFromParent(); 1425 const TargetRegisterClass *DstRC = 1426 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 1427 if (!DstRC) 1428 return true; 1429 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1430 } 1431 1432 static bool isConstant(const MachineInstr &MI) { 1433 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 1434 } 1435 1436 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 1437 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 1438 1439 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); 1440 1441 assert(PtrMI); 1442 1443 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 1444 return; 1445 1446 GEPInfo GEPInfo(*PtrMI); 1447 1448 for (unsigned i = 1; i != 3; ++i) { 1449 const MachineOperand &GEPOp = PtrMI->getOperand(i); 1450 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 1451 assert(OpDef); 1452 if (i == 2 && isConstant(*OpDef)) { 1453 // TODO: Could handle constant base + variable offset, but a combine 1454 // probably should have commuted it. 1455 assert(GEPInfo.Imm == 0); 1456 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 1457 continue; 1458 } 1459 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 1460 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 1461 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 1462 else 1463 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 1464 } 1465 1466 AddrInfo.push_back(GEPInfo); 1467 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 1468 } 1469 1470 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 1471 if (!MI.hasOneMemOperand()) 1472 return false; 1473 1474 const MachineMemOperand *MMO = *MI.memoperands_begin(); 1475 const Value *Ptr = MMO->getValue(); 1476 1477 // UndefValue means this is a load of a kernel input. These are uniform. 1478 // Sometimes LDS instructions have constant pointers. 1479 // If Ptr is null, then that means this mem operand contains a 1480 // PseudoSourceValue like GOT. 
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
  }
}

bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}

bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
  // that RegBankSelect knows what it's doing if the branch condition is scc,
  // even though it currently does not.
  if (isSCC(CondReg, *MRI)) {
    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    // FIXME: Hack for isSCC tests
    ConstrainRC = &AMDGPU::SGPR_32RegClass;
  } else if (isVCC(CondReg, *MRI)) {
    // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // We sort of know, based on the register bank, that a VCC producer ands
    // inactive lanes with 0. What if there was a logical operation with vcc
    // producers in different blocks/with different exec masks?
    // FIXME: Should scc->vcc copies and with exec?
    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  } else
    return false;

  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
  if (IsVGPR)
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  return RBI.constrainGenericRegister(
    DstReg, IsVGPR ?
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 1572 } 1573 1574 bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { 1575 uint64_t Align = I.getOperand(2).getImm(); 1576 const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); 1577 1578 MachineBasicBlock *BB = I.getParent(); 1579 1580 Register DstReg = I.getOperand(0).getReg(); 1581 Register SrcReg = I.getOperand(1).getReg(); 1582 1583 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1584 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1585 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1586 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1587 unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1588 const TargetRegisterClass &RegRC 1589 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 1590 1591 LLT Ty = MRI->getType(DstReg); 1592 1593 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 1594 *MRI); 1595 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 1596 *MRI); 1597 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 1598 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 1599 return false; 1600 1601 const DebugLoc &DL = I.getDebugLoc(); 1602 Register ImmReg = MRI->createVirtualRegister(&RegRC); 1603 BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg) 1604 .addImm(Mask); 1605 1606 if (Ty.getSizeInBits() == 32) { 1607 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 1608 .addReg(SrcReg) 1609 .addReg(ImmReg); 1610 I.eraseFromParent(); 1611 return true; 1612 } 1613 1614 Register HiReg = MRI->createVirtualRegister(&RegRC); 1615 Register LoReg = MRI->createVirtualRegister(&RegRC); 1616 Register MaskLo = MRI->createVirtualRegister(&RegRC); 1617 1618 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 1619 .addReg(SrcReg, 0, AMDGPU::sub0); 1620 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 1621 .addReg(SrcReg, 0, AMDGPU::sub1); 1622 1623 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo) 1624 .addReg(LoReg) 1625 .addReg(ImmReg); 1626 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1627 .addReg(MaskLo) 1628 .addImm(AMDGPU::sub0) 1629 .addReg(HiReg) 1630 .addImm(AMDGPU::sub1); 1631 I.eraseFromParent(); 1632 return true; 1633 } 1634 1635 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 1636 if (I.isPHI()) 1637 return selectPHI(I); 1638 1639 if (!I.isPreISelOpcode()) { 1640 if (I.isCopy()) 1641 return selectCOPY(I); 1642 return true; 1643 } 1644 1645 switch (I.getOpcode()) { 1646 case TargetOpcode::G_AND: 1647 case TargetOpcode::G_OR: 1648 case TargetOpcode::G_XOR: 1649 if (selectG_AND_OR_XOR(I)) 1650 return true; 1651 return selectImpl(I, *CoverageInfo); 1652 case TargetOpcode::G_ADD: 1653 case TargetOpcode::G_SUB: 1654 if (selectImpl(I, *CoverageInfo)) 1655 return true; 1656 return selectG_ADD_SUB(I); 1657 case TargetOpcode::G_UADDO: 1658 case TargetOpcode::G_USUBO: 1659 return selectG_UADDO_USUBO(I); 1660 case TargetOpcode::G_INTTOPTR: 1661 case TargetOpcode::G_BITCAST: 1662 case TargetOpcode::G_PTRTOINT: 1663 return selectCOPY(I); 1664 case TargetOpcode::G_CONSTANT: 1665 case TargetOpcode::G_FCONSTANT: 1666 return selectG_CONSTANT(I); 1667 case TargetOpcode::G_EXTRACT: 1668 return selectG_EXTRACT(I); 1669 case TargetOpcode::G_MERGE_VALUES: 1670 case TargetOpcode::G_BUILD_VECTOR: 1671 case TargetOpcode::G_CONCAT_VECTORS: 1672 return selectG_MERGE_VALUES(I); 1673 case TargetOpcode::G_UNMERGE_VALUES: 1674 return selectG_UNMERGE_VALUES(I); 1675 case 
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
  if (I.isPHI())
    return selectPHI(I);

  if (!I.isPreISelOpcode()) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    if (selectG_AND_OR_XOR(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_ADD_SUB(I);
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
    return selectG_UADDO_USUBO(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_PTRTOINT:
    return selectCOPY(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
    return selectG_CONSTANT(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_PTR_ADD:
    return selectG_PTR_ADD(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
    if (selectG_ICMP(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_FADD:
    return selectG_LOAD_ATOMICRMW(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_STORE:
    return selectG_STORE(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return selectG_SZA_EXT(I);
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_FRAME_INDEX:
    return selectG_FRAME_INDEX(I);
  case TargetOpcode::G_FENCE:
    // FIXME: Tablegen importer doesn't handle the imm operands correctly, and
    // is checking for G_CONSTANT
    I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE));
    return true;
  case TargetOpcode::G_PTR_MASK:
    return selectG_PTR_MASK(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
  return false;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

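// Fold G_FNEG/G_FABS defs feeding a VOP3 source operand into the NEG/ABS
// source modifier bits, returning the underlying source register together with
// the accumulated SISrcMods mask.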
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(Register Src) const {
  unsigned Mods = 0;
  MachineInstr *MI = MRI->getVRegDef(Src);

  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::NEG;
    MI = MRI->getVRegDef(Src);
  }

  if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::ABS;
  }

  return std::make_pair(Src, Mods);
}

/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
  // FIXME: Handle clamp and op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // clamp
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
  // FIXME: Handle op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
  }};
}

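// Match an SMRD address that is a single uniform (SGPR) base plus an immediate
// offset that can be encoded in the subtarget's SMRD immediate field.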
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];

  if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
    return None;

  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  if (!isUInt<32>(EncodedImm))
    return None;

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM Patterns are considered before the _SGPR patterns.
  unsigned PtrReg = GEPInfo.SgprParts[0];
  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}

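// For FLAT-style addressing, try to fold a constant G_PTR_ADD offset into the
// instruction's immediate offset field when the subtarget supports flat
// instruction offsets; otherwise fall back to the plain address with a zero
// offset.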
template <bool Signed>
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();

  InstructionSelector::ComplexRendererFns Default = {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
    }};

  if (!STI.hasFlatInstOffsets())
    return Default;

  const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
  if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
    return Default;

  Optional<int64_t> Offset =
    getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
  if (!Offset.hasValue())
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
    return Default;

  Register BasePtr = OpDef->getOperand(1).getReg();

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  return selectFlatOffsetImpl<false>(Root);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
  return selectFlatOffsetImpl<true>(Root);
}

static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}

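// MUBUF scratch access with a VGPR address (offen). A known-constant address
// is split into a low 12-bit immediate offset, with the remaining high bits
// materialized into a VGPR used as vaddr; otherwise a frame index and/or a
// legal constant offset is folded into the address operands.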
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
      .addImm(Offset & ~4095);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               const MachineMemOperand *MMO = *MI->memoperands_begin();
               const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

               Register SOffsetReg = isStackPtrRelative(PtrInfo)
                                         ? Info->getStackPtrOffsetReg()
                                         : Info->getScratchWaveOffsetReg();
               MIB.addReg(SOffsetReg);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    if (isBaseWithConstantOffset(Root, *MRI)) {
      const MachineOperand &LHS = RootDef->getOperand(1);
      const MachineOperand &RHS = RootDef->getOperand(2);
      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
      if (LHSDef && RHSDef) {
        int64_t PossibleOffset =
          RHSDef->getOperand(1).getCImm()->getSExtValue();
        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
            (!STI.privateMemoryResourceIsRangeChecked() ||
             KnownBits->signBitIsZero(LHS.getReg()))) {
          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
            FI = LHSDef->getOperand(1).getIndex();
          else
            VAddr = LHS.getReg();
          Offset = PossibleOffset;
        }
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  // TODO: Should split large offsets that don't fit like above.
  // TODO: Don't use scratch wave offset just because the offset didn't fit.
  Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
                                   : Info->getScratchWaveOffsetReg();

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             MIB.addReg(SOffset);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}

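// Return true if a DS instruction can encode Offset in an offset field of the
// given width (8 or 16 bits). Unless the subtarget has a usable DS offset or
// unsafe offset folding is enabled, the base must also be known non-negative
// (see the comment below).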
bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
                                                const MachineOperand &Base,
                                                int64_t Offset,
                                                unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return KnownBits->signBitIsZero(Base.getReg());
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  Register SOffsetReg = isStackPtrRelative(PtrInfo)
                            ? Info->getStackPtrOffsetReg()
                            : Info->getScratchWaveOffsetReg();
  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(Info->getScratchRSrcReg());
      },                                                         // rsrc
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }      // offset
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef) {
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
      }};
  }

  int64_t ConstAddr = 0;
  if (isBaseWithConstantOffset(Root, *MRI)) {
    const MachineOperand &LHS = RootDef->getOperand(1);
    const MachineOperand &RHS = RootDef->getOperand(2);
    const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
    const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
    if (LHSDef && RHSDef) {
      int64_t PossibleOffset =
        RHSDef->getOperand(1).getCImm()->getSExtValue();
      if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) {
        // (add n0, c0)
        return {{
            [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); }
          }};
      }
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {

  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
    }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
  Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(CstVal.getValue());
}