//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
                                        CodeGenCoverage &CoverageInfo) {
  MRI = &MF.getRegInfo();
  InstructionSelector::setupMF(MF, KB, CoverageInfo);
}

static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
  if (Register::isPhysicalRegister(Reg))
    return Reg == AMDGPU::SCC;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the
    // context of the register bank has been lost.
    // As a hack, getRegClassForSizeOnBank uses exactly SGPR_32RegClass, which
    // won't ever be constrained any further.
    if (RC != &AMDGPU::SGPR_32RegClass)
      return false;
    const LLT Ty = MRI.getType(Reg);
    return Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::SCCRegBankID;
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  if (Register::isPhysicalRegister(Reg))
    return Reg == TRI.getVCC();

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(SrcReg);

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, *MRI));
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    // Don't constrain the source register to a class so the def instruction
    // handles it (unless it's undef).
    //
    // FIXME: This is a hack. When selecting the def, we need to specifically
    // know that the result is VCCRegBank, and not just an SGPR with size 1.
    // An SReg_32 with size 1 is ambiguous with wave32.
    if (Src.isUndef()) {
      const TargetRegisterClass *SrcRC =
        TRI.getConstrainedRegClassForOperand(Src, *MRI);
      if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (Register::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    if (RB.getID() == AMDGPU::SCCRegBankID) {
      LLVM_DEBUG(dbgs() << "illegal scc phi\n");
      return false;
    }

    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  MachineOperand &Dst = I.getOperand(0);
  MachineOperand &Src0 = I.getOperand(1);
  MachineOperand &Src1 = I.getOperand(2);
  Register DstReg = Dst.getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
    const TargetRegisterClass *RC = TRI.getBoolRC();
    unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
                                           RC == &AMDGPU::SReg_64RegClass);
    I.setDesc(TII.get(InstOpc));

    // FIXME: Hack to avoid turning the register bank into a register class.
    // The selector for G_ICMP relies on seeing that the register bank of the
    // result is VCC. In wave32, if we constrain the registers to SReg_32 here,
    // it will be ambiguous whether it's a scalar or vector bool.
280 if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) 281 MRI->setRegClass(Src0.getReg(), RC); 282 if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) 283 MRI->setRegClass(Src1.getReg(), RC); 284 285 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 286 } 287 288 // TODO: Should this allow an SCC bank result, and produce a copy from SCC for 289 // the result? 290 if (DstRB->getID() == AMDGPU::SGPRRegBankID) { 291 unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); 292 I.setDesc(TII.get(InstOpc)); 293 // Dead implicit-def of scc 294 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 295 true, // isImp 296 false, // isKill 297 true)); // isDead 298 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 299 } 300 301 return false; 302 } 303 304 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 305 MachineBasicBlock *BB = I.getParent(); 306 MachineFunction *MF = BB->getParent(); 307 Register DstReg = I.getOperand(0).getReg(); 308 const DebugLoc &DL = I.getDebugLoc(); 309 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 310 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 311 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 312 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 313 314 if (Size == 32) { 315 if (IsSALU) { 316 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 317 MachineInstr *Add = 318 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 319 .add(I.getOperand(1)) 320 .add(I.getOperand(2)); 321 I.eraseFromParent(); 322 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 323 } 324 325 if (STI.hasAddNoCarry()) { 326 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 327 I.setDesc(TII.get(Opc)); 328 I.addOperand(*MF, MachineOperand::CreateImm(0)); 329 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 330 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 331 } 332 333 const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; 334 335 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); 336 MachineInstr *Add 337 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 338 .addDef(UnusedCarry, RegState::Dead) 339 .add(I.getOperand(1)) 340 .add(I.getOperand(2)) 341 .addImm(0); 342 I.eraseFromParent(); 343 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 344 } 345 346 assert(!Sub && "illegal sub should not reach here"); 347 348 const TargetRegisterClass &RC 349 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; 350 const TargetRegisterClass &HalfRC 351 = IsSALU ? 
    AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO;

  if (!isSCC(Dst1Reg, *MRI)) {
    // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have an
    // unsigned carry out despite the _i32 name. These were renamed in VI to
    // _U32.
    // FIXME: We should probably rename the opcodes here.
    unsigned NewOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
    I.setDesc(TII.get(NewOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();
  unsigned NewOpc = IsAdd ?
AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 423 BuildMI(*BB, &I, DL, TII.get(NewOpc), Dst0Reg) 424 .add(I.getOperand(2)) 425 .add(I.getOperand(3)); 426 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 427 .addReg(AMDGPU::SCC); 428 429 if (!MRI->getRegClassOrNull(Dst1Reg)) 430 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 431 432 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 433 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 434 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 435 return false; 436 437 I.eraseFromParent(); 438 return true; 439 } 440 441 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 442 MachineBasicBlock *BB = I.getParent(); 443 unsigned Offset = I.getOperand(2).getImm(); 444 if (Offset % 32 != 0) 445 return false; 446 447 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32); 448 const DebugLoc &DL = I.getDebugLoc(); 449 MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), 450 I.getOperand(0).getReg()) 451 .addReg(I.getOperand(1).getReg(), 0, SubReg); 452 453 for (const MachineOperand &MO : Copy->operands()) { 454 const TargetRegisterClass *RC = 455 TRI.getConstrainedRegClassForOperand(MO, *MRI); 456 if (!RC) 457 continue; 458 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 459 } 460 I.eraseFromParent(); 461 return true; 462 } 463 464 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 465 MachineBasicBlock *BB = MI.getParent(); 466 Register DstReg = MI.getOperand(0).getReg(); 467 LLT DstTy = MRI->getType(DstReg); 468 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 469 470 const unsigned SrcSize = SrcTy.getSizeInBits(); 471 if (SrcSize < 32) 472 return false; 473 474 const DebugLoc &DL = MI.getDebugLoc(); 475 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 476 const unsigned DstSize = DstTy.getSizeInBits(); 477 const TargetRegisterClass *DstRC = 478 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 479 if (!DstRC) 480 return false; 481 482 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 483 MachineInstrBuilder MIB = 484 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 485 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 486 MachineOperand &Src = MI.getOperand(I + 1); 487 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 488 MIB.addImm(SubRegs[I]); 489 490 const TargetRegisterClass *SrcRC 491 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 492 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 493 return false; 494 } 495 496 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 497 return false; 498 499 MI.eraseFromParent(); 500 return true; 501 } 502 503 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 504 MachineBasicBlock *BB = MI.getParent(); 505 const int NumDst = MI.getNumOperands() - 1; 506 507 MachineOperand &Src = MI.getOperand(NumDst); 508 509 Register SrcReg = Src.getReg(); 510 Register DstReg0 = MI.getOperand(0).getReg(); 511 LLT DstTy = MRI->getType(DstReg0); 512 LLT SrcTy = MRI->getType(SrcReg); 513 514 const unsigned DstSize = DstTy.getSizeInBits(); 515 const unsigned SrcSize = SrcTy.getSizeInBits(); 516 const DebugLoc &DL = MI.getDebugLoc(); 517 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 518 519 const TargetRegisterClass *SrcRC = 520 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 521 if (!SrcRC || 
!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 522 return false; 523 524 const unsigned SrcFlags = getUndefRegState(Src.isUndef()); 525 526 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 527 // source, and this relies on the fact that the same subregister indices are 528 // used for both. 529 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 530 for (int I = 0, E = NumDst; I != E; ++I) { 531 MachineOperand &Dst = MI.getOperand(I); 532 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 533 .addReg(SrcReg, SrcFlags, SubRegs[I]); 534 535 const TargetRegisterClass *DstRC = 536 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 537 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 538 return false; 539 } 540 541 MI.eraseFromParent(); 542 return true; 543 } 544 545 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 546 return selectG_ADD_SUB(I); 547 } 548 549 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 550 const MachineOperand &MO = I.getOperand(0); 551 552 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 553 // regbank check here is to know why getConstrainedRegClassForOperand failed. 554 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 555 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 556 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 557 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 558 return true; 559 } 560 561 return false; 562 } 563 564 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 565 MachineBasicBlock *BB = I.getParent(); 566 567 Register DstReg = I.getOperand(0).getReg(); 568 Register Src0Reg = I.getOperand(1).getReg(); 569 Register Src1Reg = I.getOperand(2).getReg(); 570 LLT Src1Ty = MRI->getType(Src1Reg); 571 572 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 573 unsigned InsSize = Src1Ty.getSizeInBits(); 574 575 int64_t Offset = I.getOperand(3).getImm(); 576 if (Offset % 32 != 0) 577 return false; 578 579 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 580 if (SubReg == AMDGPU::NoSubRegister) 581 return false; 582 583 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 584 const TargetRegisterClass *DstRC = 585 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 586 if (!DstRC) 587 return false; 588 589 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 590 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 591 const TargetRegisterClass *Src0RC = 592 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 593 const TargetRegisterClass *Src1RC = 594 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 595 596 // Deal with weird cases where the class only partially supports the subreg 597 // index. 
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ?
AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64; 672 } 673 } 674 675 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, 676 unsigned Size) const { 677 if (Size == 64) { 678 if (!STI.hasScalarCompareEq64()) 679 return -1; 680 681 switch (P) { 682 case CmpInst::ICMP_NE: 683 return AMDGPU::S_CMP_LG_U64; 684 case CmpInst::ICMP_EQ: 685 return AMDGPU::S_CMP_EQ_U64; 686 default: 687 return -1; 688 } 689 } 690 691 if (Size != 32) 692 return -1; 693 694 switch (P) { 695 case CmpInst::ICMP_NE: 696 return AMDGPU::S_CMP_LG_U32; 697 case CmpInst::ICMP_EQ: 698 return AMDGPU::S_CMP_EQ_U32; 699 case CmpInst::ICMP_SGT: 700 return AMDGPU::S_CMP_GT_I32; 701 case CmpInst::ICMP_SGE: 702 return AMDGPU::S_CMP_GE_I32; 703 case CmpInst::ICMP_SLT: 704 return AMDGPU::S_CMP_LT_I32; 705 case CmpInst::ICMP_SLE: 706 return AMDGPU::S_CMP_LE_I32; 707 case CmpInst::ICMP_UGT: 708 return AMDGPU::S_CMP_GT_U32; 709 case CmpInst::ICMP_UGE: 710 return AMDGPU::S_CMP_GE_U32; 711 case CmpInst::ICMP_ULT: 712 return AMDGPU::S_CMP_LT_U32; 713 case CmpInst::ICMP_ULE: 714 return AMDGPU::S_CMP_LE_U32; 715 default: 716 llvm_unreachable("Unknown condition code!"); 717 } 718 } 719 720 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { 721 MachineBasicBlock *BB = I.getParent(); 722 const DebugLoc &DL = I.getDebugLoc(); 723 724 Register SrcReg = I.getOperand(2).getReg(); 725 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 726 727 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 728 729 Register CCReg = I.getOperand(0).getReg(); 730 if (isSCC(CCReg, *MRI)) { 731 int Opcode = getS_CMPOpcode(Pred, Size); 732 if (Opcode == -1) 733 return false; 734 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) 735 .add(I.getOperand(2)) 736 .add(I.getOperand(3)); 737 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) 738 .addReg(AMDGPU::SCC); 739 bool Ret = 740 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && 741 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); 742 I.eraseFromParent(); 743 return Ret; 744 } 745 746 int Opcode = getV_CMPOpcode(Pred, Size); 747 if (Opcode == -1) 748 return false; 749 750 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), 751 I.getOperand(0).getReg()) 752 .add(I.getOperand(2)) 753 .add(I.getOperand(3)); 754 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), 755 *TRI.getBoolRC(), *MRI); 756 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); 757 I.eraseFromParent(); 758 return Ret; 759 } 760 761 static MachineInstr * 762 buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt, 763 unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3, 764 unsigned VM, bool Compr, unsigned Enabled, bool Done) { 765 const DebugLoc &DL = Insert->getDebugLoc(); 766 MachineBasicBlock &BB = *Insert->getParent(); 767 unsigned Opcode = Done ? 
AMDGPU::EXP_DONE : AMDGPU::EXP; 768 return BuildMI(BB, Insert, DL, TII.get(Opcode)) 769 .addImm(Tgt) 770 .addReg(Reg0) 771 .addReg(Reg1) 772 .addReg(Reg2) 773 .addReg(Reg3) 774 .addImm(VM) 775 .addImm(Compr) 776 .addImm(Enabled); 777 } 778 779 static bool isZero(Register Reg, MachineRegisterInfo &MRI) { 780 int64_t C; 781 if (mi_match(Reg, MRI, m_ICst(C)) && C == 0) 782 return true; 783 784 // FIXME: matcher should ignore copies 785 return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; 786 } 787 788 static unsigned extractGLC(unsigned AuxiliaryData) { 789 return AuxiliaryData & 1; 790 } 791 792 static unsigned extractSLC(unsigned AuxiliaryData) { 793 return (AuxiliaryData >> 1) & 1; 794 } 795 796 static unsigned extractDLC(unsigned AuxiliaryData) { 797 return (AuxiliaryData >> 2) & 1; 798 } 799 800 static unsigned extractSWZ(unsigned AuxiliaryData) { 801 return (AuxiliaryData >> 3) & 1; 802 } 803 804 // Returns Base register, constant offset, and offset def point. 805 static std::tuple<Register, unsigned, MachineInstr *> 806 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 807 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); 808 if (!Def) 809 return std::make_tuple(Reg, 0, nullptr); 810 811 if (Def->getOpcode() == AMDGPU::G_CONSTANT) { 812 unsigned Offset; 813 const MachineOperand &Op = Def->getOperand(1); 814 if (Op.isImm()) 815 Offset = Op.getImm(); 816 else 817 Offset = Op.getCImm()->getZExtValue(); 818 819 return std::make_tuple(Register(), Offset, Def); 820 } 821 822 int64_t Offset; 823 if (Def->getOpcode() == AMDGPU::G_ADD) { 824 // TODO: Handle G_OR used for add case 825 if (mi_match(Def->getOperand(1).getReg(), MRI, m_ICst(Offset))) 826 return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def); 827 828 // FIXME: matcher should ignore copies 829 if (mi_match(Def->getOperand(1).getReg(), MRI, m_Copy(m_ICst(Offset)))) 830 return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def); 831 } 832 833 return std::make_tuple(Reg, 0, Def); 834 } 835 836 static unsigned getBufferStoreOpcode(LLT Ty, 837 const unsigned MemSize, 838 const bool Offen) { 839 const int Size = Ty.getSizeInBits(); 840 switch (8 * MemSize) { 841 case 8: 842 return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : 843 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; 844 case 16: 845 return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : 846 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; 847 default: 848 unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : 849 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; 850 if (Size > 32) 851 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); 852 return Opc; 853 } 854 } 855 856 static unsigned getBufferStoreFormatOpcode(LLT Ty, 857 const unsigned MemSize, 858 const bool Offen) { 859 bool IsD16Packed = Ty.getScalarSizeInBits() == 16; 860 bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits(); 861 int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 862 863 if (IsD16Packed) { 864 switch (NumElts) { 865 case 1: 866 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : 867 AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; 868 case 2: 869 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact : 870 AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact; 871 case 3: 872 return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact : 873 AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact; 874 case 4: 875 return Offen ? 
                   AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact :
                   AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact;
    default:
      return -1;
    }
  }

  if (IsD16Unpacked) {
    switch (NumElts) {
    case 1:
      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
                     AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
    case 2:
      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact :
                     AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact;
    case 3:
      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact :
                     AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact;
    case 4:
      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact :
                     AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact;
    default:
      return -1;
    }
  }

  switch (NumElts) {
  case 1:
    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact :
                   AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact;
  case 2:
    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact :
                   AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact;
  case 3:
    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact :
                   AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact;
  case 4:
    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact :
                   AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact;
  default:
    return -1;
  }

  llvm_unreachable("unhandled buffer store");
}

// TODO: Move this to combiner
// Returns base register, imm offset, total constant offset.
std::tuple<Register, unsigned, unsigned>
AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B,
                                              Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  MachineInstr *OffsetDef;

  std::tie(BaseReg, TotalConstOffset, OffsetDef)
    = getBaseWithConstantOffset(*MRI, OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    // In case this is in a waterfall loop, insert offset code at the def point
    // of the offset, not inside the loop.
953 MachineBasicBlock::iterator OldInsPt = B.getInsertPt(); 954 MachineBasicBlock &OldMBB = B.getMBB(); 955 B.setInstr(*OffsetDef); 956 957 if (!BaseReg) { 958 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 959 B.buildInstr(AMDGPU::V_MOV_B32_e32) 960 .addDef(BaseReg) 961 .addImm(Overflow); 962 } else { 963 Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 964 B.buildInstr(AMDGPU::V_MOV_B32_e32) 965 .addDef(OverflowVal) 966 .addImm(Overflow); 967 968 Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 969 TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg) 970 .addReg(BaseReg) 971 .addReg(OverflowVal, RegState::Kill) 972 .addImm(0); 973 BaseReg = NewBaseReg; 974 } 975 976 B.setInsertPt(OldMBB, OldInsPt); 977 } 978 979 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 980 } 981 982 bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, 983 bool IsFormat) const { 984 MachineIRBuilder B(MI); 985 MachineFunction &MF = B.getMF(); 986 Register VData = MI.getOperand(1).getReg(); 987 LLT Ty = MRI->getType(VData); 988 989 int Size = Ty.getSizeInBits(); 990 if (Size % 32 != 0) 991 return false; 992 993 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 994 MachineMemOperand *MMO = *MI.memoperands_begin(); 995 const int MemSize = MMO->getSize(); 996 997 Register RSrc = MI.getOperand(2).getReg(); 998 Register VOffset = MI.getOperand(3).getReg(); 999 Register SOffset = MI.getOperand(4).getReg(); 1000 unsigned AuxiliaryData = MI.getOperand(5).getImm(); 1001 unsigned ImmOffset; 1002 unsigned TotalOffset; 1003 1004 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 1005 if (TotalOffset != 0) 1006 MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize); 1007 1008 const bool Offen = !isZero(VOffset, *MRI); 1009 1010 int Opc = IsFormat ? 
                        getBufferStoreFormatOpcode(Ty, MemSize, Offen) :
                        getBufferStoreOpcode(Ty, MemSize, Offen);
  if (Opc == -1)
    return false;

  MachineInstrBuilder MIB = B.buildInstr(Opc)
    .addUse(VData);

  if (Offen)
    MIB.addUse(VOffset);

  MIB.addUse(RSrc)
     .addUse(SOffset)
     .addImm(ImmOffset)
     .addImm(extractGLC(AuxiliaryData))
     .addImm(extractSLC(AuxiliaryData))
     .addImm(0) // tfe: FIXME: Remove from inst
     .addImm(extractDLC(AuxiliaryData))
     .addImm(extractSWZ(AuxiliaryData))
     .addMemOperand(MMO);

  MI.eraseFromParent();

  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
  MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp: {
    int64_t Tgt = I.getOperand(1).getImm();
    int64_t Enabled = I.getOperand(2).getImm();
    int64_t Done = I.getOperand(7).getImm();
    int64_t VM = I.getOperand(8).getImm();

    MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
                                 I.getOperand(4).getReg(),
                                 I.getOperand(5).getReg(),
                                 I.getOperand(6).getReg(),
                                 VM, false, Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  case Intrinsic::amdgcn_exp_compr: {
    const DebugLoc &DL = I.getDebugLoc();
    int64_t Tgt = I.getOperand(1).getImm();
    int64_t Enabled = I.getOperand(2).getImm();
    Register Reg0 = I.getOperand(3).getReg();
    Register Reg1 = I.getOperand(4).getReg();
    Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    int64_t Done = I.getOperand(5).getImm();
    int64_t VM = I.getOperand(6).getImm();

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
    MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
                                 true, Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  case Intrinsic::amdgcn_end_cf: {
    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(),
            TII.get(AMDGPU::SI_END_CF))
      .add(I.getOperand(1));

    Register Reg = I.getOperand(1).getReg();
    I.eraseFromParent();

    if (!MRI->getRegClassOrNull(Reg))
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return selectStoreIntrinsic(I, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return selectStoreIntrinsic(I, true);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (isSCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ?
AMDGPU::S_CSELECT_B64 : 1107 AMDGPU::S_CSELECT_B32; 1108 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1109 .addReg(CCReg); 1110 1111 // The generic constrainSelectedInstRegOperands doesn't work for the scc register 1112 // bank, because it does not cover the register class that we used to represent 1113 // for it. So we need to manually set the register class here. 1114 if (!MRI->getRegClassOrNull(CCReg)) 1115 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1116 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1117 .add(I.getOperand(2)) 1118 .add(I.getOperand(3)); 1119 1120 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1121 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1122 I.eraseFromParent(); 1123 return Ret; 1124 } 1125 1126 // Wide VGPR select should have been split in RegBankSelect. 1127 if (Size > 32) 1128 return false; 1129 1130 MachineInstr *Select = 1131 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1132 .addImm(0) 1133 .add(I.getOperand(3)) 1134 .addImm(0) 1135 .add(I.getOperand(2)) 1136 .add(I.getOperand(1)); 1137 1138 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1139 I.eraseFromParent(); 1140 return Ret; 1141 } 1142 1143 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { 1144 initM0(I); 1145 return selectImpl(I, *CoverageInfo); 1146 } 1147 1148 static int sizeToSubRegIndex(unsigned Size) { 1149 switch (Size) { 1150 case 32: 1151 return AMDGPU::sub0; 1152 case 64: 1153 return AMDGPU::sub0_sub1; 1154 case 96: 1155 return AMDGPU::sub0_sub1_sub2; 1156 case 128: 1157 return AMDGPU::sub0_sub1_sub2_sub3; 1158 case 256: 1159 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1160 default: 1161 if (Size < 32) 1162 return AMDGPU::sub0; 1163 if (Size > 256) 1164 return -1; 1165 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1166 } 1167 } 1168 1169 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1170 Register DstReg = I.getOperand(0).getReg(); 1171 Register SrcReg = I.getOperand(1).getReg(); 1172 const LLT DstTy = MRI->getType(DstReg); 1173 const LLT SrcTy = MRI->getType(SrcReg); 1174 if (!DstTy.isScalar()) 1175 return false; 1176 1177 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1178 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1179 if (SrcRB != DstRB) 1180 return false; 1181 1182 unsigned DstSize = DstTy.getSizeInBits(); 1183 unsigned SrcSize = SrcTy.getSizeInBits(); 1184 1185 const TargetRegisterClass *SrcRC 1186 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1187 const TargetRegisterClass *DstRC 1188 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1189 1190 if (SrcSize > 32) { 1191 int SubRegIdx = sizeToSubRegIndex(DstSize); 1192 if (SubRegIdx == -1) 1193 return false; 1194 1195 // Deal with weird cases where the class only partially supports the subreg 1196 // index. 1197 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1198 if (!SrcRC) 1199 return false; 1200 1201 I.getOperand(1).setSubReg(SubRegIdx); 1202 } 1203 1204 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1205 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1206 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1207 return false; 1208 } 1209 1210 I.setDesc(TII.get(TargetOpcode::COPY)); 1211 return true; 1212 } 1213 1214 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 
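/// The computed mask value is also returned to the caller in \p Mask.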
1215 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1216 Mask = maskTrailingOnes<unsigned>(Size); 1217 int SignedMask = static_cast<int>(Mask); 1218 return SignedMask >= -16 && SignedMask <= 64; 1219 } 1220 1221 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1222 bool Signed = I.getOpcode() == AMDGPU::G_SEXT; 1223 const DebugLoc &DL = I.getDebugLoc(); 1224 MachineBasicBlock &MBB = *I.getParent(); 1225 const Register DstReg = I.getOperand(0).getReg(); 1226 const Register SrcReg = I.getOperand(1).getReg(); 1227 1228 const LLT DstTy = MRI->getType(DstReg); 1229 const LLT SrcTy = MRI->getType(SrcReg); 1230 const LLT S1 = LLT::scalar(1); 1231 const unsigned SrcSize = SrcTy.getSizeInBits(); 1232 const unsigned DstSize = DstTy.getSizeInBits(); 1233 if (!DstTy.isScalar()) 1234 return false; 1235 1236 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 1237 1238 if (SrcBank->getID() == AMDGPU::SCCRegBankID) { 1239 if (SrcTy != S1 || DstSize > 64) // Invalid 1240 return false; 1241 1242 unsigned Opcode = 1243 DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; 1244 const TargetRegisterClass *DstRC = 1245 DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass; 1246 1247 // FIXME: Create an extra copy to avoid incorrectly constraining the result 1248 // of the scc producer. 1249 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1250 BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg) 1251 .addReg(SrcReg); 1252 BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1253 .addReg(TmpReg); 1254 1255 // The instruction operands are backwards from what you would expect. 1256 BuildMI(MBB, I, DL, TII.get(Opcode), DstReg) 1257 .addImm(0) 1258 .addImm(Signed ? -1 : 1); 1259 I.eraseFromParent(); 1260 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1261 } 1262 1263 if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) { 1264 if (SrcTy != S1) // Invalid 1265 return false; 1266 1267 MachineInstr *ExtI = 1268 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1269 .addImm(0) // src0_modifiers 1270 .addImm(0) // src0 1271 .addImm(0) // src1_modifiers 1272 .addImm(Signed ? -1 : 1) // src1 1273 .addUse(SrcReg); 1274 I.eraseFromParent(); 1275 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1276 } 1277 1278 if (I.getOpcode() == AMDGPU::G_ANYEXT) 1279 return selectCOPY(I); 1280 1281 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1282 // 64-bit should have been split up in RegBankSelect 1283 1284 // Try to use an and with a mask if it will save code size. 1285 unsigned Mask; 1286 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1287 MachineInstr *ExtI = 1288 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 1289 .addImm(Mask) 1290 .addReg(SrcReg); 1291 I.eraseFromParent(); 1292 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1293 } 1294 1295 const unsigned BFE = Signed ? 
AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; 1296 MachineInstr *ExtI = 1297 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 1298 .addReg(SrcReg) 1299 .addImm(0) // Offset 1300 .addImm(SrcSize); // Width 1301 I.eraseFromParent(); 1302 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 1303 } 1304 1305 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 1306 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) 1307 return false; 1308 1309 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 1310 const unsigned SextOpc = SrcSize == 8 ? 1311 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 1312 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 1313 .addReg(SrcReg); 1314 I.eraseFromParent(); 1315 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1316 } 1317 1318 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 1319 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1320 1321 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 1322 if (DstSize > 32 && SrcSize <= 32) { 1323 // We need a 64-bit register source, but the high bits don't matter. 1324 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 1325 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1326 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1327 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 1328 .addReg(SrcReg) 1329 .addImm(AMDGPU::sub0) 1330 .addReg(UndefReg) 1331 .addImm(AMDGPU::sub1); 1332 1333 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 1334 .addReg(ExtReg) 1335 .addImm(SrcSize << 16); 1336 1337 I.eraseFromParent(); 1338 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 1339 } 1340 1341 unsigned Mask; 1342 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 1343 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 1344 .addReg(SrcReg) 1345 .addImm(Mask); 1346 } else { 1347 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 1348 .addReg(SrcReg) 1349 .addImm(SrcSize << 16); 1350 } 1351 1352 I.eraseFromParent(); 1353 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 1354 } 1355 1356 return false; 1357 } 1358 1359 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 1360 MachineBasicBlock *BB = I.getParent(); 1361 MachineOperand &ImmOp = I.getOperand(1); 1362 1363 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 1364 if (ImmOp.isFPImm()) { 1365 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 1366 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 1367 } else if (ImmOp.isCImm()) { 1368 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); 1369 } 1370 1371 Register DstReg = I.getOperand(0).getReg(); 1372 unsigned Size; 1373 bool IsSgpr; 1374 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); 1375 if (RB) { 1376 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; 1377 Size = MRI->getType(DstReg).getSizeInBits(); 1378 } else { 1379 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); 1380 IsSgpr = TRI.isSGPRClass(RC); 1381 Size = TRI.getRegSizeInBits(*RC); 1382 } 1383 1384 if (Size != 32 && Size != 64) 1385 return false; 1386 1387 unsigned Opcode = IsSgpr ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1388 if (Size == 32) { 1389 I.setDesc(TII.get(Opcode)); 1390 I.addImplicitDefUseOperands(*MF); 1391 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 1392 } 1393 1394 const DebugLoc &DL = I.getDebugLoc(); 1395 1396 APInt Imm(Size, I.getOperand(1).getImm()); 1397 1398 MachineInstr *ResInst; 1399 if (IsSgpr && TII.isInlineConstant(Imm)) { 1400 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1401 .addImm(I.getOperand(1).getImm()); 1402 } else { 1403 const TargetRegisterClass *RC = IsSgpr ? 1404 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 1405 Register LoReg = MRI->createVirtualRegister(RC); 1406 Register HiReg = MRI->createVirtualRegister(RC); 1407 1408 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 1409 .addImm(Imm.trunc(32).getZExtValue()); 1410 1411 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 1412 .addImm(Imm.ashr(32).getZExtValue()); 1413 1414 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1415 .addReg(LoReg) 1416 .addImm(AMDGPU::sub0) 1417 .addReg(HiReg) 1418 .addImm(AMDGPU::sub1); 1419 } 1420 1421 // We can't call constrainSelectedInstRegOperands here, because it doesn't 1422 // work for target independent opcodes 1423 I.eraseFromParent(); 1424 const TargetRegisterClass *DstRC = 1425 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 1426 if (!DstRC) 1427 return true; 1428 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 1429 } 1430 1431 static bool isConstant(const MachineInstr &MI) { 1432 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 1433 } 1434 1435 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 1436 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 1437 1438 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); 1439 1440 assert(PtrMI); 1441 1442 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 1443 return; 1444 1445 GEPInfo GEPInfo(*PtrMI); 1446 1447 for (unsigned i = 1; i != 3; ++i) { 1448 const MachineOperand &GEPOp = PtrMI->getOperand(i); 1449 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 1450 assert(OpDef); 1451 if (i == 2 && isConstant(*OpDef)) { 1452 // TODO: Could handle constant base + variable offset, but a combine 1453 // probably should have commuted it. 1454 assert(GEPInfo.Imm == 0); 1455 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 1456 continue; 1457 } 1458 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 1459 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 1460 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 1461 else 1462 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 1463 } 1464 1465 AddrInfo.push_back(GEPInfo); 1466 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 1467 } 1468 1469 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 1470 if (!MI.hasOneMemOperand()) 1471 return false; 1472 1473 const MachineMemOperand *MMO = *MI.memoperands_begin(); 1474 const Value *Ptr = MMO->getValue(); 1475 1476 // UndefValue means this is a load of a kernel input. These are uniform. 1477 // Sometimes LDS instructions have constant pointers. 1478 // If Ptr is null, then that means this mem operand contains a 1479 // PseudoSourceValue like GOT. 
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
  }
}

bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}

bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
  // RegBankSelect knows what it's doing if the branch condition is scc, even
  // though it currently does not.
  if (isSCC(CondReg, *MRI)) {
    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    // FIXME: Hack for isSCC tests
    ConstrainRC = &AMDGPU::SGPR_32RegClass;
  } else if (isVCC(CondReg, *MRI)) {
    // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // Based on the register bank, we sort of know that a VCC producer ands
    // inactive lanes with 0. What if there was a logical operation with vcc
    // producers in different blocks/with different exec masks?
    // FIXME: Should scc->vcc copies and with exec?
    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  } else
    return false;

  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
  if (IsVGPR)
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  return RBI.constrainGenericRegister(
    DstReg, IsVGPR ?
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 1571 } 1572 1573 bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { 1574 uint64_t Align = I.getOperand(2).getImm(); 1575 const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); 1576 1577 MachineBasicBlock *BB = I.getParent(); 1578 1579 Register DstReg = I.getOperand(0).getReg(); 1580 Register SrcReg = I.getOperand(1).getReg(); 1581 1582 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1583 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1584 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 1585 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1586 unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1587 const TargetRegisterClass &RegRC 1588 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 1589 1590 LLT Ty = MRI->getType(DstReg); 1591 1592 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 1593 *MRI); 1594 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 1595 *MRI); 1596 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 1597 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 1598 return false; 1599 1600 const DebugLoc &DL = I.getDebugLoc(); 1601 Register ImmReg = MRI->createVirtualRegister(&RegRC); 1602 BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg) 1603 .addImm(Mask); 1604 1605 if (Ty.getSizeInBits() == 32) { 1606 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 1607 .addReg(SrcReg) 1608 .addReg(ImmReg); 1609 I.eraseFromParent(); 1610 return true; 1611 } 1612 1613 Register HiReg = MRI->createVirtualRegister(&RegRC); 1614 Register LoReg = MRI->createVirtualRegister(&RegRC); 1615 Register MaskLo = MRI->createVirtualRegister(&RegRC); 1616 1617 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 1618 .addReg(SrcReg, 0, AMDGPU::sub0); 1619 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 1620 .addReg(SrcReg, 0, AMDGPU::sub1); 1621 1622 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo) 1623 .addReg(LoReg) 1624 .addReg(ImmReg); 1625 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1626 .addReg(MaskLo) 1627 .addImm(AMDGPU::sub0) 1628 .addReg(HiReg) 1629 .addImm(AMDGPU::sub1); 1630 I.eraseFromParent(); 1631 return true; 1632 } 1633 1634 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 1635 if (I.isPHI()) 1636 return selectPHI(I); 1637 1638 if (!I.isPreISelOpcode()) { 1639 if (I.isCopy()) 1640 return selectCOPY(I); 1641 return true; 1642 } 1643 1644 switch (I.getOpcode()) { 1645 case TargetOpcode::G_AND: 1646 case TargetOpcode::G_OR: 1647 case TargetOpcode::G_XOR: 1648 if (selectG_AND_OR_XOR(I)) 1649 return true; 1650 return selectImpl(I, *CoverageInfo); 1651 case TargetOpcode::G_ADD: 1652 case TargetOpcode::G_SUB: 1653 if (selectImpl(I, *CoverageInfo)) 1654 return true; 1655 return selectG_ADD_SUB(I); 1656 case TargetOpcode::G_UADDO: 1657 case TargetOpcode::G_USUBO: 1658 return selectG_UADDO_USUBO(I); 1659 case TargetOpcode::G_INTTOPTR: 1660 case TargetOpcode::G_BITCAST: 1661 case TargetOpcode::G_PTRTOINT: 1662 return selectCOPY(I); 1663 case TargetOpcode::G_CONSTANT: 1664 case TargetOpcode::G_FCONSTANT: 1665 return selectG_CONSTANT(I); 1666 case TargetOpcode::G_EXTRACT: 1667 return selectG_EXTRACT(I); 1668 case TargetOpcode::G_MERGE_VALUES: 1669 case TargetOpcode::G_BUILD_VECTOR: 1670 case TargetOpcode::G_CONCAT_VECTORS: 1671 return selectG_MERGE_VALUES(I); 1672 case TargetOpcode::G_UNMERGE_VALUES: 1673 return selectG_UNMERGE_VALUES(I); 1674 case 
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
  if (I.isPHI())
    return selectPHI(I);

  if (!I.isPreISelOpcode()) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    if (selectG_AND_OR_XOR(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_ADD_SUB(I);
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
    return selectG_UADDO_USUBO(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_PTRTOINT:
    return selectCOPY(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
    return selectG_CONSTANT(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_PTR_ADD:
    return selectG_PTR_ADD(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
    if (selectG_ICMP(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_FADD:
    return selectG_LOAD_ATOMICRMW(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_STORE:
    return selectG_STORE(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return selectG_SZA_EXT(I);
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_FRAME_INDEX:
    return selectG_FRAME_INDEX(I);
  case TargetOpcode::G_PTR_MASK:
    return selectG_PTR_MASK(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
  return false;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}
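
// Figure out the source modifiers (neg/abs) for a VOP3 source operand by
// peeling G_FNEG and G_FABS off of the value feeding it. Rough sketch of the
// intent (hand-written example, operand details omitted):
//
//   %a = G_FABS %x
//   %s = G_FNEG %a
//   ... %s used as a VOP3 source ...
//     =>  source = %x, src_mods = SISrcMods::NEG | SISrcMods::ABS
//
// Note this only looks through fneg(fabs(x)), not fabs(fneg(x)).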
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(
  Register Src) const {
  unsigned Mods = 0;
  MachineInstr *MI = MRI->getVRegDef(Src);

  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::NEG;
    MI = MRI->getVRegDef(Src);
  }

  if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::ABS;
  }

  return std::make_pair(Src, Mods);
}

///
/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
  // FIXME: Handle clamp and op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // clamp
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
  // FIXME: Handle op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];

  if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
    return None;

  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}
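
// Variant of selectSmrdImm used by the SMRD patterns that take a 32-bit
// offset: the same base/offset matching, except that instead of checking
// isLegalSMRDImmOffset it only requires the encoded offset to fit in 32 bits.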
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  if (!isUInt<32>(EncodedImm))
    return None;

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM patterns are considered before the _SGPR patterns.
  unsigned PtrReg = GEPInfo.SgprParts[0];
  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}
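
// Match an address for flat instructions that support an immediate offset.
// If the address is a G_PTR_ADD with a constant offset that is legal for the
// address space, fold the constant into the instruction's offset field;
// otherwise fall back to (addr, offset = 0). Rough sketch (hand-written
// example):
//
//   %c    = G_CONSTANT i64 16
//   %addr = G_PTR_ADD %base, %c
//   ... flat load from %addr ...
//     =>  vaddr = %base, offset = 16, slc = 0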
template <bool Signed>
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();

  InstructionSelector::ComplexRendererFns Default = {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },  // offset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }   // slc
    }};

  if (!STI.hasFlatInstOffsets())
    return Default;

  const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
  if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
    return Default;

  Optional<int64_t> Offset =
    getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
  if (!Offset.hasValue())
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
    return Default;

  Register BasePtr = OpDef->getOperand(1).getReg();

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  return selectFlatOffsetImpl<false>(Root);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
  return selectFlatOffsetImpl<true>(Root);
}

static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}
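
// Match a private (scratch) address for the MUBUF "offen" addressing mode
// (rsrc, vaddr, soffset, 12-bit immediate offset). For a purely constant
// address the constant is split: the bits above 4095 are materialized into a
// VGPR used as vaddr, and the low 12 bits go into the immediate offset field.
// Otherwise, try to fold a frame index (plus a legal constant offset) into
// vaddr. Rough sketch of the constant case (hand-written example):
//
//   address = G_CONSTANT 8196
//     =>  vaddr = V_MOV_B32 8192, offset = 4,
//         soffset = stack pointer or scratch wave offset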
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
      .addImm(Offset & ~4095);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               const MachineMemOperand *MMO = *MI->memoperands_begin();
               const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

               Register SOffsetReg = isStackPtrRelative(PtrInfo)
                                         ? Info->getStackPtrOffsetReg()
                                         : Info->getScratchWaveOffsetReg();
               MIB.addReg(SOffsetReg);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    if (isBaseWithConstantOffset(Root, *MRI)) {
      const MachineOperand &LHS = RootDef->getOperand(1);
      const MachineOperand &RHS = RootDef->getOperand(2);
      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
      if (LHSDef && RHSDef) {
        int64_t PossibleOffset =
          RHSDef->getOperand(1).getCImm()->getSExtValue();
        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
            (!STI.privateMemoryResourceIsRangeChecked() ||
             KnownBits->signBitIsZero(LHS.getReg()))) {
          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
            FI = LHSDef->getOperand(1).getIndex();
          else
            VAddr = LHS.getReg();
          Offset = PossibleOffset;
        }
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  // TODO: Should split large offsets that don't fit like above.
  // TODO: Don't use scratch wave offset just because the offset didn't fit.
  Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
                                   : Info->getScratchWaveOffsetReg();

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             MIB.addReg(SOffset);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}

bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
                                                const MachineOperand &Base,
                                                int64_t Offset,
                                                unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return KnownBits->signBitIsZero(Base.getReg());
}
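
// Match a private (scratch) address that is entirely a legal constant
// offset; it is selected as rsrc + soffset + immediate offset, with no vaddr
// operand. soffset is the stack pointer for accesses known to be
// stack-relative, and the scratch wave offset register otherwise.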
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
  MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  int64_t Offset = 0;
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
    return {};

  const MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
  const MachineMemOperand *MMO = *MI->memoperands_begin();
  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();

  Register SOffsetReg = isStackPtrRelative(PtrInfo)
                            ? Info->getStackPtrOffsetReg()
                            : Info->getScratchWaveOffsetReg();
  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(Info->getScratchRSrcReg());
      },                                                         // rsrc
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }      // offset
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef) {
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
      }};
  }

  int64_t ConstAddr = 0;
  if (isBaseWithConstantOffset(Root, *MRI)) {
    const MachineOperand &LHS = RootDef->getOperand(1);
    const MachineOperand &RHS = RootDef->getOperand(2);
    const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
    const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
    if (LHSDef && RHSDef) {
      int64_t PossibleOffset =
        RHSDef->getOperand(1).getCImm()->getSExtValue();
      if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) {
        // (add n0, c0)
        return {{
            [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); }
          }};
      }
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO: Not yet handled.
  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO: Not yet handled.
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
    }};
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
  Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), *MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(CstVal.getValue());
}