//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <new>
#include <vector>

#define DEBUG_TYPE "isel"

using namespace llvm;

namespace llvm {

class R600InstrInfo;

} // end namespace llvm

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {

/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
  // make the right decision when generating code for different targets.
  const GCNSubtarget *Subtarget;
  bool EnableLateStructurizeCFG;

public:
  explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
                              CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
    : SelectionDAGISel(*TM, OptLevel) {
    EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
  }
  ~AMDGPUDAGToDAGISel() override = default;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AMDGPUArgumentUsageInfo>();
    AU.addRequired<AMDGPUPerfHintAnalysis>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    SelectionDAGISel::getAnalysisUsage(AU);
  }

  bool matchLoadD16FromBuildVector(SDNode *N) const;

  bool runOnMachineFunction(MachineFunction &MF) override;
  void PreprocessISelDAG() override;
  void Select(SDNode *N) override;
  StringRef getPassName() const override;
  void PostprocessISelDAG() override;

protected:
  void SelectBuildVector(SDNode *N, unsigned RegClassID);

private:
  std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
  bool isNoNanSrc(SDValue N) const;
  bool isInlineImmediate(const SDNode *N) const;
  bool isVGPRImm(const SDNode *N) const;
  bool isUniformLoad(const SDNode *N) const;
  bool isUniformBr(const SDNode *N) const;

  MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;

  SDNode *glueCopyToM0LDSInit(SDNode *N) const;
  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;

  const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
  virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
  virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
                       unsigned OffsetBits) const;
  bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
  bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                 SDValue &Offset1) const;
  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                   SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                   SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
                   SDValue &TFE) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                         SDValue &SOffset, SDValue &Offset, SDValue &GLC,
                         SDValue &SLC, SDValue &TFE) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                         SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
                         SDValue &SLC) const;
  bool SelectMUBUFScratchOffen(SDNode *Parent,
                               SDValue Addr, SDValue &RSrc, SDValue &VAddr,
                               SDValue &SOffset, SDValue &ImmOffset) const;
  bool SelectMUBUFScratchOffset(SDNode *Parent,
                                SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                                SDValue &Offset) const;

  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
                         SDValue &Offset, SDValue &GLC, SDValue &SLC,
                         SDValue &TFE) const;
  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                         SDValue &Offset, SDValue &SLC) const;
  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                         SDValue &Offset) const;

  bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
                        SDValue &Offset, SDValue &SLC) const;
  bool SelectFlatAtomicSigned(SDValue Addr, SDValue &VAddr,
                              SDValue &Offset, SDValue &SLC) const;

  template <bool IsSigned>
  bool SelectFlatOffset(SDValue Addr, SDValue &VAddr,
                        SDValue &Offset, SDValue &SLC) const;

  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                        bool &Imm) const;
  SDValue Expand32BitAddress(SDValue Addr) const;
  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
                  bool &Imm) const;
  bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
  bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
  bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;

  bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
  bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
  bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                       SDValue &Clamp, SDValue &Omod) const;
  bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                         SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
                                 SDValue &Clamp,
                                 SDValue &Omod) const;

  bool SelectVOP3OMods(SDValue In, SDValue &Src,
                       SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp) const;

  bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp) const;

  bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                            SDValue &Clamp) const;
  bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                 unsigned &Mods) const;
  bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

  SDValue getHi16Elt(SDValue In) const;

  void SelectADD_SUB_I64(SDNode *N);
  void SelectUADDO_USUBO(SDNode *N);
  void SelectDIV_SCALE(SDNode *N);
  void SelectMAD_64_32(SDNode *N);
  void SelectFMA_W_CHAIN(SDNode *N);
  void SelectFMUL_W_CHAIN(SDNode *N);

  SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
                   uint32_t Offset, uint32_t Width);
  void SelectS_BFEFromShifts(SDNode *N);
  void SelectS_BFE(SDNode *N);
  bool isCBranchSCC(const SDNode *N) const;
  void SelectBRCOND(SDNode *N);
  void SelectFMAD_FMA(SDNode *N);
  void SelectATOMIC_CMP_SWAP(SDNode *N);
  void SelectINTRINSIC_W_CHAIN(SDNode *N);

protected:
  // Include the pieces autogenerated from the target description.
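  // Note: AMDGPUGenDAGISel.inc is generated by TableGen from the AMDGPU .td
  // pattern files. It supplies the matcher table that SelectCode(N) walks
  // whenever none of the manually handled cases in Select() below apply.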
#include "AMDGPUGenDAGISel.inc"
};

class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
  const R600Subtarget *Subtarget;

  bool isConstantLoad(const MemSDNode *N, int cbID) const;
  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue &IntPtr);
  bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
                                       SDValue &Offset);

public:
  explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
      AMDGPUDAGToDAGISel(TM, OptLevel) {}

  void Select(SDNode *N) override;

  bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;
  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;

  bool runOnMachineFunction(MachineFunction &MF) override;

  void PreprocessISelDAG() override {}

protected:
  // Include the pieces autogenerated from the target description.
#include "R600GenDAGISel.inc"
};

static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);
  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
                                        CodeGenOpt::Level OptLevel) {
  return new AMDGPUDAGToDAGISel(TM, OptLevel);
}

/// This pass converts a legalized DAG into an R600-specific
/// DAG, ready for instruction scheduling.
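/// Unlike createAMDGPUISelDag above, this instantiates the R600 subclass,
/// which overrides the R600 addressing-mode predicates and top-level Select().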
FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
                                      CodeGenOpt::Level OptLevel) {
  return new R600DAGToDAGISel(TM, OptLevel);
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
  if (TM.Options.NoNaNsFPMath)
    return true;

  // TODO: Move into isKnownNeverNaN
  if (N->getFlags().isDefined())
    return N->getFlags().hasNoNaNs();

  return CurDAG->isKnownNeverNaN(N);
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
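/// For machine nodes the class is taken from the MCInstrDesc operand info;
/// for REG_SEQUENCE it is derived from the subregister index operand instead.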
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      return TRI->getPhysRegClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.OpInfo[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering &Lowering =
      *static_cast<const SITargetLowering *>(getTargetLowering());

  // Glue a CopyToReg of Val into m0 immediately before N.
  SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
                                 Val);

  SDValue Glue = M0.getValue(1);

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
      !Subtarget->ldsRequiresM0Init())
    return N;
  // Initialize m0 to the maximum value (-1) before each LDS operation.
  return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
}

MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}

static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
  switch (NumVectorElts) {
  case 1:
    return AMDGPU::SReg_32_XM0RegClassID;
  case 2:
    return AMDGPU::SReg_64RegClassID;
  case 3:
    return AMDGPU::SGPR_96RegClassID;
  case 4:
    return AMDGPU::SReg_128RegClassID;
  case 5:
    return AMDGPU::SGPR_160RegClassID;
  case 8:
    return AMDGPU::SReg_256RegClassID;
  case 16:
    return AMDGPU::SReg_512RegClassID;
  }

  llvm_unreachable("invalid vector size");
}

static bool getConstantValue(SDValue N, uint32_t &Out) {
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
    Out = C->getAPIntValue().getZExtValue();
    return true;
  }

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
    Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
                                "supported yet");
  // 16 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  if (isa<AtomicSDNode>(N) ||
      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
       Opc == ISD::ATOMIC_LOAD_FADD ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
    N = glueCopyToM0LDSInit(N);

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
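  // Keeping the i64 add un-expanded until selection lets the addressing-mode
  // patterns fold it first; e.g. (load (add i64:%ptr, 16)) can become a single
  // load with an immediate offset where the encoding allows it.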
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        uint32_t LHSVal, RHSVal;
        if (getConstantValue(N->getOperand(0), LHSVal) &&
            getConstantValue(N->getOperand(1), RHSVal)) {
          uint32_t K = LHSVal | (RHSVal << 16);
          CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
                               CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
    else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case ISD::LOAD:
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE: {
    N = glueCopyToM0LDSInit(N);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
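    // For example, (bfe_u32 x, 8, 8) becomes s_bfe_u32 x, 0x80008: the width
    // (8) goes in bits [22:16] of the packed operand and the offset (8) in
    // bits [5:0] (see getS_BFE below).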
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
                            SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::CopyToReg: {
    const SITargetLowering &Lowering =
        *static_cast<const SITargetLowering *>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FMAD:
  case ISD::FMA:
    SelectFMAD_FMA(N);
    return;
  case AMDGPUISD::ATOMIC_CMP_SWAP:
    SelectATOMIC_CMP_SWAP(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

StringRef AMDGPUDAGToDAGISel::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

// FIXME: Should only handle addcarry/subcarry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo, 0),
    Sub0,
    SDValue(AddHi, 0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of these opcodes are misleading: v_add_i32/v_sub_i32 have an
  // unsigned carry out despite the _i32 suffix. They were renamed to _U32 in
  // VI.
  // FIXME: We should probably rename the opcodes here.
  unsigned Opc = N->getOpcode() == ISD::UADDO ?
    AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;

  CurDAG->SelectNodeTo(
      N, Opc, N->getVTList(),
      {N->getOperand(0), N->getOperand(1),
       CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;

  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
                                         unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
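  // Only fold the offset if the base is provably non-negative.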
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isUInt<16>(ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_I32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isUInt<16>(CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

// TODO: If offset is too big, put low 16-bit into offset.
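// The two offsets produced here are dword offsets, not byte offsets: the
// ds_read2/ds_write2 family takes a pair of 8-bit dword offsets, so a byte
// offset c becomes offset0 = c/4 and offset1 = c/4 + 1 for the two 32-bit
// halves.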
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned DWordOffset0 = C1->getZExtValue() / 4;
    unsigned DWordOffset1 = DWordOffset0 + 1;
    // (add n0, c0)
    if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned DWordOffset0 = C->getZExtValue() / 4;
      unsigned DWordOffset1 = DWordOffset0 + 1;

      if (isUInt<8>(DWordOffset0)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_I32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub
            = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
    unsigned DWordOffset1 = DWordOffset0 + 1;
    assert(4 * DWordOffset0 == CAddr->getZExtValue());

    if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero
        = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                 DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                     SDValue &VAddr, SDValue &SOffset,
                                     SDValue &Offset, SDValue &Offen,
                                     SDValue &Idxen, SDValue &Addr64,
                                     SDValue &GLC, SDValue &SLC,
                                     SDValue &TFE) const {
  // Subtarget prefers to use flat instructions.
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  if (!GLC.getNode())
    GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  if (!SLC.getNode())
    SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
    return true;
  }

  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset, SDValue &GLC,
                                           SDValue &SLC, SDValue &TFE) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
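  // As a result this pattern can only match on SI/CI; newer targets access
  // global memory through flat instructions instead.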
  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
                   GLC, SLC, TFE))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering &Lowering =
        *static_cast<const SITargetLowering *>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset,
                                           SDValue &SLC) const {
  SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
  SDValue GLC, TFE;

  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
}

static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (auto FI = dyn_cast<FrameIndexSDNode>(N)) {
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));

    // If we can resolve this to a frame index access, this is relative to the
    // frame pointer SGPR.
    return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(),
                                                   MVT::i32));
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
                                               MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {
  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned Imm = CAddr->getZExtValue();

    SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
    MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                        DL, MVT::i32, HighBits);
    VAddr = SDValue(MovHighBits, 0);

    // In a call sequence, stores to the argument stack area are relative to the
    // stack pointer.
    const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
    unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
        Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();

    SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
    ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
    return true;
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
  if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
    return false;

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
  unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
      Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();

  // FIXME: Get from MachinePointerInfo? We should only be using the frame
  // offset if we know this is in a call sequence.
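  // For now, stack-pointer-relative accesses (the call argument area) use the
  // stack pointer SGPR and everything else uses the scratch wave offset
  // register.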
  SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset,
                                           SDValue &GLC, SDValue &SLC,
                                           SDValue &TFE) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
                   GLC, SLC, TFE))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnesValue(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering &Lowering =
        *static_cast<const SITargetLowering *>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &Soffset,
                                           SDValue &Offset) const {
  SDValue GLC, SLC, TFE;

  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &Soffset, SDValue &Offset,
                                           SDValue &SLC) const {
  SDValue GLC, TFE;

  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
}

template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset,
                                          SDValue &SLC) const {
  int64_t OffsetVal = 0;

  if (Subtarget->hasFlatInstOffsets() &&
      CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

    if ((IsSigned && isInt<13>(COffsetVal)) ||
        (!IsSigned && isUInt<12>(COffsetVal))) {
      Addr = N0;
      OffsetVal = COffsetVal;
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
  SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);

  return true;
}

bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset,
                                          SDValue &SLC) const {
  return SelectFlatOffset<false>(Addr, VAddr, Offset, SLC);
}

bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDValue Addr,
                                                SDValue &VAddr,
                                                SDValue &Offset,
                                                SDValue &SLC) const {
  return SelectFlatOffset<true>(Addr, VAddr, Offset, SLC);
}

bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                          SDValue &Offset, bool &Imm) const {
  // FIXME: Handle non-constant offsets.
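  // Convert the constant byte offset to the generation-specific SMRD encoding
  // (AMDGPU::getSMRDEncodedOffset) and emit it as an immediate when legal;
  // otherwise materialize it in an SGPR with s_mov_b32.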
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C)
    return false;

  SDLoc SL(ByteOffsetNode);
  GCNSubtarget::Generation Gen = Subtarget->getGeneration();
  int64_t ByteOffset = C->getSExtValue();
  int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);

  if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) {
    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
    Imm = true;
    return true;
  }

  if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
    return false;

  if (Gen == AMDGPUSubtarget::SEA_ISLANDS) {
    // 32-bit Immediates are supported on Sea Islands.
    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
  } else {
    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
    Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
                                            C32Bit), 0);
  }
  Imm = false;
  return true;
}

SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
  if (Addr.getValueType() != MVT::i32)
    return Addr;

  // Zero-extend a 32-bit address.
  SDLoc SL(Addr);

  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned AddrHiVal = Info->get32BitAddressHighBits();
  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);

  const SDValue Ops[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
    Addr,
    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
            0),
    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
  };

  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
                                        Ops), 0);
}

bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                    SDValue &Offset, bool &Imm) const {
  SDLoc SL(Addr);

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
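  // So only split a 32-bit base + offset when the add is known not to wrap
  // (nuw); 64-bit addresses can always be split.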
  if ((Addr.getValueType() != MVT::i32 ||
       Addr->getFlags().hasNoUnsignedWrap()) &&
      CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    if (SelectSMRDOffset(N1, Offset, Imm)) {
      SBase = Expand32BitAddress(N0);
      return true;
    }
  }
  SBase = Expand32BitAddress(Addr);
  Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
  Imm = true;
  return true;
}

bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  bool Imm;
  return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
}

bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {
  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
    return false;

  bool Imm;
  if (!SelectSMRD(Addr, SBase, Offset, Imm))
    return false;

  return !Imm && isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
                                        SDValue &Offset) const {
  bool Imm;
  return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
         !isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
                                             SDValue &Offset) const {
  bool Imm;
  return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
                                               SDValue &Offset) const {
  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
    return false;

  bool Imm;
  if (!SelectSMRDOffset(Addr, Offset, Imm))
    return false;

  return !Imm && isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Index)) {
    SDValue N0 = Index.getOperand(0);
    SDValue N1 = Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  if (isa<ConstantSDNode>(Index))
    return false;

  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
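  // e.g. Offset = 16, Width = 8 packs to 0x00080010.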
  uint32_t PackedVal = Offset | (Width << 16);
  SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);

  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}

void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "((a << b) srl c)" ---> "BFE_U32 a, (c - b), (32 - c)"
  // "((a << b) sra c)" ---> "BFE_I32 a, (c - b), (32 - c)"
  // Predicate: 0 < b <= c < 32

  const SDValue &Shl = N->getOperand(0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      bool Signed = N->getOpcode() == ISD::SRA;
      unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

      ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
                              32 - CVal));
      return;
    }
  }
  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  Srl.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "((a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  And.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  SelectCode(N);
}

// Returns true if the condition of this BRCOND can be selected as a scalar
// compare whose result lands in SCC.
bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
  assert(N->getOpcode() == ISD::BRCOND);
  if (!N->hasOneUse())
    return false;

  SDValue Cond = N->getOperand(1);
  if (Cond.getOpcode() == ISD::CopyToReg)
    Cond = Cond.getOperand(2);

  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
    return false;

  MVT VT = Cond.getOperand(0).getSimpleValueType();
  if (VT == MVT::i32)
    return true;

  if (VT == MVT::i64) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
           Subtarget->hasScalarCompareEq64();
  }

  return false;
}

void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
  unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC;
  SDLoc SL(N);

  if (!UseSCCBr) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets changed to
    // S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU, which inserts the S_AND.
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when it is unnecessary. But it would be better to add a
    // separate pass after SIFixSGPRCopies to do the unnecessary S_AND
    // removal, so it catches both cases.
    Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
                       CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
                       Cond),
                   0);
  }

  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}

void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
  MVT VT = N->getSimpleValueType(0);
  bool IsFMA = N->getOpcode() == ISD::FMA;
  if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
                         !Subtarget->hasFmaMixInsts()) ||
      ((IsFMA && Subtarget->hasMadMixInsts()) ||
       (!IsFMA && Subtarget->hasFmaMixInsts()))) {
    SelectCode(N);
    return;
  }

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);
  unsigned Src0Mods, Src1Mods, Src2Mods;

  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an
  // operand using the conversion from f16.
  bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
  bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
  bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);

  assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
         "fmad selected with denormals enabled");
  // TODO: We can select this with f32 denormals enabled if all the sources are
  // converted from f16 (in which case fmad isn't legal).

  if (Sel0 || Sel1 || Sel2) {
    // For dummy operands.
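    // The meaningful op_sel/op_sel_hi bits ride in the per-source modifier
    // immediates built from Src0Mods/Src1Mods/Src2Mods below; the trailing
    // zero operands merely fill out the instruction's operand list
    // (presumably clamp plus the unused packed op_sel operands, going by the
    // VOP3P operand order).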
    SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
    SDValue Ops[] = {
      CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
      CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
      CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
      CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
      Zero, Zero
    };

    CurDAG->SelectNodeTo(N,
                         IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
                         MVT::f32, Ops);
  } else {
    SelectCode(N);
  }
}

// This is here because there isn't a way to use the generated sub0_sub1 as the
// subreg index to EXTRACT_SUBREG in tablegen.
void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
  MemSDNode *Mem = cast<MemSDNode>(N);
  unsigned AS = Mem->getAddressSpace();
  if (AS == AMDGPUAS::FLAT_ADDRESS) {
    SelectCode(N);
    return;
  }

  MVT VT = N->getSimpleValueType(0);
  bool Is32 = (VT == MVT::i32);
  SDLoc SL(N);

  MachineSDNode *CmpSwap = nullptr;
  if (Subtarget->hasAddr64()) {
    SDValue SRsrc, VAddr, SOffset, Offset, SLC;

    if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset,
                          SLC)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
                               AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
      SDValue CmpVal = Mem->getOperand(2);

      // XXX - Do we care about glue operands?

      SDValue Ops[] = {
        CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain()
      };

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SDValue SRsrc, SOffset, Offset, SLC;
    if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
                               AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;

      SDValue CmpVal = Mem->getOperand(2);
      SDValue Ops[] = {
        CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain()
      };

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SelectCode(N);
    return;
  }

  MachineMemOperand *MMO = Mem->getMemOperand();
  CurDAG->setNodeMemRefs(CmpSwap, {MMO});

  unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  SDValue Extract
    = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));

  ReplaceUses(SDValue(N, 0), Extract);
  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
  CurDAG->RemoveDeadNode(N);
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  if ((IntrID != Intrinsic::amdgcn_ds_append &&
       IntrID != Intrinsic::amdgcn_ds_consume) ||
      N->getValueType(0) != MVT::i32) {
    SelectCode(N);
    return;
  }

  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
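  //
  // ds_append/ds_consume read their base address from m0; glueCopyToM0 below
  // materializes the m0 write, and a constant offset that passes
  // isDSOffsetLegal is folded into the instruction's 16-bit offset field
  // instead.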
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(0);
  SDValue Ptr = N->getOperand(2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
    SDValue PtrBase = Ptr.getOperand(0);
    SDValue PtrOffset = Ptr.getOperand(1);

    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
    }
  }

  if (!Offset) {
    N = glueCopyToM0(N, Ptr);
    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
    Chain,
    N->getOperand(N->getNumOperands() - 1) // New glue
  };

  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                            unsigned &Mods) const {
  Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::FABS) {
    Mods |= SISrcMods::ABS;
    Src = Src.getOperand(0);
  }

  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                        SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  SelectVOP3Mods(In, Src, SrcMods);
  return isNoNanSrc(Src);
}

bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
  if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
    return false;

  Src = In;
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, SDValue &Clamp,
                                         SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
                                                   SDValue &SrcMods,
                                                   SDValue &Clamp,
                                                   SDValue &Omod) const {
  Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
                                         SDValue &Clamp, SDValue &Omod) const {
  Src = In;

  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  unsigned Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Src.getOperand(0));
    SDValue Hi = stripBitcast(Src.getOperand(1));
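
    // Fold a per-half fneg into the NEG/NEG_HI bits and a high-half extract
    // into the OP_SEL bits, so the packed value can be read straight out of
    // its source register.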
    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Lo.getOperand(0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
    }

    if (isExtractHiElt(Lo, Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(Hi, Hi))
      Mods |= SISrcMods::OP_SEL_1;

    Lo = stripExtractLoElt(Lo);
    Hi = stripExtractLoElt(Hi);

    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register
      // to avoid packing.

      Src = Lo;
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }

    Mods = VecMods;
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp and op_sel
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3PMods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSel(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
                                              SDValue &SrcMods,
                                              SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSelMods(In, Src, SrcMods);
}

// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // f16. If the source's op_sel is set, it picks the high half of the
    // source register.
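    //
    // The source is known to be an f16 value here, so request the conversion
    // unconditionally via op_sel_hi; op_sel is added just below only when the
    // value sits in the high 16 bits of its 32-bit register.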
    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
      C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}

bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return false;

  const SIRegisterInfo *SIRI =
    static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *SII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be an unknown register
    // class that needs to be an SGPR, e.g. an inline asm constraint.
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass) {
      AllUsesAcceptSReg = false;
      SDNode *User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
              getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If AllUsesAcceptSReg is still false, we haven't succeeded in
      // commuting the current user, which means at least one use strictly
      // requires a VGPR. Thus, we will not attempt to commute other user
      // instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}

bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  auto Ld = cast<LoadSDNode>(N);

  return Ld->getAlignment() >= 4 &&
    (((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      !N->isDivergent()) ||
     (Subtarget->getScalarizeGlobalBehavior() &&
      Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
      !Ld->isVolatile() &&
      !N->isDivergent() &&
      static_cast<const SITargetLowering *>(
        getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}

void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering &Lowering =
    *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
  bool IsModified = false;
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more.
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}

bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<R600Subtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
  if (!N->readMem())
    return false;
  if (CbId == -1)
    return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

  return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
}

bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
                                                       SDValue &IntPtr) {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
                                       true);
    return true;
  }
  return false;
}

bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
                                                       SDValue &BaseReg,
                                                       SDValue &Offset) {
  if (!isa<ConstantSDNode>(Addr)) {
    BaseReg = Addr;
    Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
    return true;
  }
  return false;
}

void R600DAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (Opc) {
  default: break;
  case AMDGPUISD::BUILD_VERTICAL_VECTOR:
  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    unsigned RegClassID;
    // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
    // sequence, which adds a 128-bit reg copy when going through the
    // TwoAddressInstructions pass. We want to avoid 128-bit copies as much
    // as possible because they can't be bundled by our scheduler.
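    //
    // The class chosen here feeds SelectBuildVector, which is expected to
    // assemble the vector as a single REG_SEQUENCE in that class rather than
    // a chain of INSERT_SUBREGs.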
    switch (NumVectorElts) {
    case 2:
      RegClassID = R600::R600_Reg64RegClassID;
      break;
    case 4:
      if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
        RegClassID = R600::R600_Reg128VerticalRegClassID;
      else
        RegClassID = R600::R600_Reg128RegClassID;
      break;
    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
    }
    SelectBuildVector(N, RegClassID);
    return;
  }
  }

  SelectCode(N);
}

bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
  ConstantSDNode *IMMOffset;

  if (Addr.getOpcode() == ISD::ADD
      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
      && isInt<16>(IMMOffset->getZExtValue())) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
                                       MVT::i32);
    return true;
  // If the pointer address is constant, we can move it to the offset field.
  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
             && isInt<16>(IMMOffset->getZExtValue())) {
    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
                                  SDLoc(CurDAG->getEntryNode()),
                                  R600::ZERO, MVT::i32);
    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
                                       MVT::i32);
    return true;
  }

  // Default case, no offset
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
  return true;
}