//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/InitializePasses.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Dominators.h"
#endif
#include "llvm/IR/Instruction.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <new>
#include <vector>

#define DEBUG_TYPE "isel"

using namespace llvm;

namespace llvm {

class R600InstrInfo;

} // end namespace llvm

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {

static bool isNullConstantOrUndef(SDValue V) {
  if (V.isUndef())
    return true;

  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
  return Const != nullptr && Const->isNullValue();
}

static bool getConstantValue(SDValue N, uint32_t &Out) {
  // This is only used for packed vectors, where using 0 for undef should
  // always be good.
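  // For example, a v2f16 build_vector of (1.0, 2.0) yields the 16-bit
  // patterns 0x3c00 and 0x4000 here, which packConstantV2I16 below folds
  // into the single 32-bit immediate 0x40003c00.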
84 if (N.isUndef()) { 85 Out = 0; 86 return true; 87 } 88 89 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { 90 Out = C->getAPIntValue().getSExtValue(); 91 return true; 92 } 93 94 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) { 95 Out = C->getValueAPF().bitcastToAPInt().getSExtValue(); 96 return true; 97 } 98 99 return false; 100 } 101 102 // TODO: Handle undef as zero 103 static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, 104 bool Negate = false) { 105 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); 106 uint32_t LHSVal, RHSVal; 107 if (getConstantValue(N->getOperand(0), LHSVal) && 108 getConstantValue(N->getOperand(1), RHSVal)) { 109 SDLoc SL(N); 110 uint32_t K = Negate ? 111 (-LHSVal & 0xffff) | (-RHSVal << 16) : 112 (LHSVal & 0xffff) | (RHSVal << 16); 113 return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), 114 DAG.getTargetConstant(K, SL, MVT::i32)); 115 } 116 117 return nullptr; 118 } 119 120 static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) { 121 return packConstantV2I16(N, DAG, true); 122 } 123 124 /// AMDGPU specific code to select AMDGPU machine instructions for 125 /// SelectionDAG operations. 126 class AMDGPUDAGToDAGISel : public SelectionDAGISel { 127 // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can 128 // make the right decision when generating code for different targets. 129 const GCNSubtarget *Subtarget; 130 131 // Default FP mode for the current function. 132 AMDGPU::SIModeRegisterDefaults Mode; 133 134 bool EnableLateStructurizeCFG; 135 136 public: 137 explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr, 138 CodeGenOpt::Level OptLevel = CodeGenOpt::Default) 139 : SelectionDAGISel(*TM, OptLevel) { 140 EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG; 141 } 142 ~AMDGPUDAGToDAGISel() override = default; 143 144 void getAnalysisUsage(AnalysisUsage &AU) const override { 145 AU.addRequired<AMDGPUArgumentUsageInfo>(); 146 AU.addRequired<LegacyDivergenceAnalysis>(); 147 #ifdef EXPENSIVE_CHECKS 148 AU.addRequired<DominatorTreeWrapperPass>(); 149 AU.addRequired<LoopInfoWrapperPass>(); 150 #endif 151 SelectionDAGISel::getAnalysisUsage(AU); 152 } 153 154 bool matchLoadD16FromBuildVector(SDNode *N) const; 155 156 bool runOnMachineFunction(MachineFunction &MF) override; 157 void PreprocessISelDAG() override; 158 void Select(SDNode *N) override; 159 StringRef getPassName() const override; 160 void PostprocessISelDAG() override; 161 162 protected: 163 void SelectBuildVector(SDNode *N, unsigned RegClassID); 164 165 private: 166 std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; 167 bool isNoNanSrc(SDValue N) const; 168 bool isInlineImmediate(const SDNode *N, bool Negated = false) const; 169 bool isNegInlineImmediate(const SDNode *N) const { 170 return isInlineImmediate(N, true); 171 } 172 173 bool isInlineImmediate16(int64_t Imm) const { 174 return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm()); 175 } 176 177 bool isInlineImmediate32(int64_t Imm) const { 178 return AMDGPU::isInlinableLiteral32(Imm, Subtarget->hasInv2PiInlineImm()); 179 } 180 181 bool isInlineImmediate64(int64_t Imm) const { 182 return AMDGPU::isInlinableLiteral64(Imm, Subtarget->hasInv2PiInlineImm()); 183 } 184 185 bool isInlineImmediate(const APFloat &Imm) const { 186 return Subtarget->getInstrInfo()->isInlineConstant(Imm); 187 } 188 189 bool isVGPRImm(const SDNode *N) const; 190 bool isUniformLoad(const SDNode *N) const; 191 
bool isUniformBr(const SDNode *N) const; 192 193 MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; 194 195 SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; 196 SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; 197 SDNode *glueCopyToM0LDSInit(SDNode *N) const; 198 199 const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; 200 virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); 201 virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); 202 bool isDSOffsetLegal(SDValue Base, unsigned Offset, 203 unsigned OffsetBits) const; 204 bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; 205 bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, 206 SDValue &Offset1) const; 207 bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, 208 SDValue &SOffset, SDValue &Offset, SDValue &Offen, 209 SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, 210 SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; 211 bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, 212 SDValue &SOffset, SDValue &Offset, SDValue &GLC, 213 SDValue &SLC, SDValue &TFE, SDValue &DLC, 214 SDValue &SWZ) const; 215 bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, 216 SDValue &VAddr, SDValue &SOffset, SDValue &Offset, 217 SDValue &SLC) const; 218 bool SelectMUBUFScratchOffen(SDNode *Parent, 219 SDValue Addr, SDValue &RSrc, SDValue &VAddr, 220 SDValue &SOffset, SDValue &ImmOffset) const; 221 bool SelectMUBUFScratchOffset(SDNode *Parent, 222 SDValue Addr, SDValue &SRsrc, SDValue &Soffset, 223 SDValue &Offset) const; 224 225 bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, 226 SDValue &Offset, SDValue &GLC, SDValue &SLC, 227 SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; 228 bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, 229 SDValue &Offset, SDValue &SLC) const; 230 bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, 231 SDValue &Offset) const; 232 233 template <bool IsSigned> 234 bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, 235 SDValue &Offset, SDValue &SLC) const; 236 bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr, 237 SDValue &Offset, SDValue &SLC) const; 238 bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr, 239 SDValue &Offset, SDValue &SLC) const; 240 241 bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, 242 bool &Imm) const; 243 SDValue Expand32BitAddress(SDValue Addr) const; 244 bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, 245 bool &Imm) const; 246 bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; 247 bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; 248 bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; 249 bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; 250 bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; 251 bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; 252 253 bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; 254 bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const; 255 bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; 256 bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; 257 bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, 258 SDValue &Clamp, SDValue &Omod) const; 259 
bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, 260 SDValue &Clamp, SDValue &Omod) const; 261 262 bool SelectVOP3OMods(SDValue In, SDValue &Src, 263 SDValue &Clamp, SDValue &Omod) const; 264 265 bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; 266 267 bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; 268 269 bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; 270 bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const; 271 bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; 272 273 SDValue getHi16Elt(SDValue In) const; 274 275 SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const; 276 277 void SelectADD_SUB_I64(SDNode *N); 278 void SelectAddcSubb(SDNode *N); 279 void SelectUADDO_USUBO(SDNode *N); 280 void SelectDIV_SCALE(SDNode *N); 281 void SelectDIV_FMAS(SDNode *N); 282 void SelectMAD_64_32(SDNode *N); 283 void SelectFMA_W_CHAIN(SDNode *N); 284 void SelectFMUL_W_CHAIN(SDNode *N); 285 286 SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val, 287 uint32_t Offset, uint32_t Width); 288 void SelectS_BFEFromShifts(SDNode *N); 289 void SelectS_BFE(SDNode *N); 290 bool isCBranchSCC(const SDNode *N) const; 291 void SelectBRCOND(SDNode *N); 292 void SelectFMAD_FMA(SDNode *N); 293 void SelectATOMIC_CMP_SWAP(SDNode *N); 294 void SelectDSAppendConsume(SDNode *N, unsigned IntrID); 295 void SelectDS_GWS(SDNode *N, unsigned IntrID); 296 void SelectInterpP1F16(SDNode *N); 297 void SelectINTRINSIC_W_CHAIN(SDNode *N); 298 void SelectINTRINSIC_WO_CHAIN(SDNode *N); 299 void SelectINTRINSIC_VOID(SDNode *N); 300 301 protected: 302 // Include the pieces autogenerated from the target description. 303 #include "AMDGPUGenDAGISel.inc" 304 }; 305 306 class R600DAGToDAGISel : public AMDGPUDAGToDAGISel { 307 const R600Subtarget *Subtarget; 308 309 bool isConstantLoad(const MemSDNode *N, int cbID) const; 310 bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); 311 bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, 312 SDValue& Offset); 313 public: 314 explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) : 315 AMDGPUDAGToDAGISel(TM, OptLevel) {} 316 317 void Select(SDNode *N) override; 318 319 bool SelectADDRIndirect(SDValue Addr, SDValue &Base, 320 SDValue &Offset) override; 321 bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, 322 SDValue &Offset) override; 323 324 bool runOnMachineFunction(MachineFunction &MF) override; 325 326 void PreprocessISelDAG() override {} 327 328 protected: 329 // Include the pieces autogenerated from the target description. 330 #include "R600GenDAGISel.inc" 331 }; 332 333 static SDValue stripBitcast(SDValue Val) { 334 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val; 335 } 336 337 // Figure out if this is really an extract of the high 16-bits of a dword. 338 static bool isExtractHiElt(SDValue In, SDValue &Out) { 339 In = stripBitcast(In); 340 if (In.getOpcode() != ISD::TRUNCATE) 341 return false; 342 343 SDValue Srl = In.getOperand(0); 344 if (Srl.getOpcode() == ISD::SRL) { 345 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { 346 if (ShiftAmt->getZExtValue() == 16) { 347 Out = stripBitcast(Srl.getOperand(0)); 348 return true; 349 } 350 } 351 } 352 353 return false; 354 } 355 356 // Look through operations that obscure just looking at the low 16-bits of the 357 // same register. 
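// For instance, (i16 (truncate i32:x)) only reads the low half of x, so the
// 32-bit source itself can be returned and used when forming packed operands.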
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
                                        CodeGenOpt::Level OptLevel) {
  return new AMDGPUDAGToDAGISel(TM, OptLevel);
}

/// This pass converts a legalized DAG into an R600-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
                                      CodeGenOpt::Level OptLevel) {
  return new R600DAGToDAGISel(TM, OptLevel);
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
437 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8; 438 } else { 439 assert(LdHi->getMemoryVT() == MVT::i16); 440 } 441 442 SDValue NewLoadHi = 443 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList, 444 Ops, LdHi->getMemoryVT(), 445 LdHi->getMemOperand()); 446 447 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi); 448 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1)); 449 return true; 450 } 451 452 // build_vector (load ptr), hi -> load_d16_lo ptr, hi 453 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi 454 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi 455 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo)); 456 if (LdLo && Lo.hasOneUse()) { 457 SDValue TiedIn = getHi16Elt(Hi); 458 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode())) 459 return false; 460 461 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other); 462 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO; 463 if (LdLo->getMemoryVT() == MVT::i8) { 464 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ? 465 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8; 466 } else { 467 assert(LdLo->getMemoryVT() == MVT::i16); 468 } 469 470 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn); 471 472 SDValue Ops[] = { 473 LdLo->getChain(), LdLo->getBasePtr(), TiedIn 474 }; 475 476 SDValue NewLoadLo = 477 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList, 478 Ops, LdLo->getMemoryVT(), 479 LdLo->getMemOperand()); 480 481 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo); 482 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1)); 483 return true; 484 } 485 486 return false; 487 } 488 489 void AMDGPUDAGToDAGISel::PreprocessISelDAG() { 490 if (!Subtarget->d16PreservesUnusedBits()) 491 return; 492 493 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); 494 495 bool MadeChange = false; 496 while (Position != CurDAG->allnodes_begin()) { 497 SDNode *N = &*--Position; 498 if (N->use_empty()) 499 continue; 500 501 switch (N->getOpcode()) { 502 case ISD::BUILD_VECTOR: 503 MadeChange |= matchLoadD16FromBuildVector(N); 504 break; 505 default: 506 break; 507 } 508 } 509 510 if (MadeChange) { 511 CurDAG->RemoveDeadNodes(); 512 LLVM_DEBUG(dbgs() << "After PreProcess:\n"; 513 CurDAG->dump();); 514 } 515 } 516 517 bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const { 518 if (TM.Options.NoNaNsFPMath) 519 return true; 520 521 // TODO: Move into isKnownNeverNaN 522 if (N->getFlags().isDefined()) 523 return N->getFlags().hasNoNaNs(); 524 525 return CurDAG->isKnownNeverNaN(N); 526 } 527 528 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N, 529 bool Negated) const { 530 if (N->isUndef()) 531 return true; 532 533 const SIInstrInfo *TII = Subtarget->getInstrInfo(); 534 if (Negated) { 535 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) 536 return TII->isInlineConstant(-C->getAPIntValue()); 537 538 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) 539 return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt()); 540 541 } else { 542 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) 543 return TII->isInlineConstant(C->getAPIntValue()); 544 545 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) 546 return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); 547 } 548 549 return false; 550 } 551 552 /// Determine the register class for \p OpNo 553 /// \returns The register class of the virtual register that will be used for 554 /// the given operand number 
\OpNo or NULL if the register class cannot be 555 /// determined. 556 const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, 557 unsigned OpNo) const { 558 if (!N->isMachineOpcode()) { 559 if (N->getOpcode() == ISD::CopyToReg) { 560 unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg(); 561 if (Register::isVirtualRegister(Reg)) { 562 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo(); 563 return MRI.getRegClass(Reg); 564 } 565 566 const SIRegisterInfo *TRI 567 = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo(); 568 return TRI->getPhysRegClass(Reg); 569 } 570 571 return nullptr; 572 } 573 574 switch (N->getMachineOpcode()) { 575 default: { 576 const MCInstrDesc &Desc = 577 Subtarget->getInstrInfo()->get(N->getMachineOpcode()); 578 unsigned OpIdx = Desc.getNumDefs() + OpNo; 579 if (OpIdx >= Desc.getNumOperands()) 580 return nullptr; 581 int RegClass = Desc.OpInfo[OpIdx].RegClass; 582 if (RegClass == -1) 583 return nullptr; 584 585 return Subtarget->getRegisterInfo()->getRegClass(RegClass); 586 } 587 case AMDGPU::REG_SEQUENCE: { 588 unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 589 const TargetRegisterClass *SuperRC = 590 Subtarget->getRegisterInfo()->getRegClass(RCID); 591 592 SDValue SubRegOp = N->getOperand(OpNo + 1); 593 unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); 594 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC, 595 SubRegIdx); 596 } 597 } 598 } 599 600 SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain, 601 SDValue Glue) const { 602 SmallVector <SDValue, 8> Ops; 603 Ops.push_back(NewChain); // Replace the chain. 604 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 605 Ops.push_back(N->getOperand(i)); 606 607 Ops.push_back(Glue); 608 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); 609 } 610 611 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { 612 const SITargetLowering& Lowering = 613 *static_cast<const SITargetLowering*>(getTargetLowering()); 614 615 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain"); 616 617 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val); 618 return glueCopyToOp(N, M0, M0.getValue(1)); 619 } 620 621 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const { 622 unsigned AS = cast<MemSDNode>(N)->getAddressSpace(); 623 if (AS == AMDGPUAS::LOCAL_ADDRESS) { 624 if (Subtarget->ldsRequiresM0Init()) 625 return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); 626 } else if (AS == AMDGPUAS::REGION_ADDRESS) { 627 MachineFunction &MF = CurDAG->getMachineFunction(); 628 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize(); 629 return 630 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32)); 631 } 632 return N; 633 } 634 635 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, 636 EVT VT) const { 637 SDNode *Lo = CurDAG->getMachineNode( 638 AMDGPU::S_MOV_B32, DL, MVT::i32, 639 CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32)); 640 SDNode *Hi = 641 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, 642 CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32)); 643 const SDValue Ops[] = { 644 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), 645 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 646 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)}; 647 648 return 
CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops); 649 } 650 651 static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { 652 switch (NumVectorElts) { 653 case 1: 654 return AMDGPU::SReg_32RegClassID; 655 case 2: 656 return AMDGPU::SReg_64RegClassID; 657 case 3: 658 return AMDGPU::SGPR_96RegClassID; 659 case 4: 660 return AMDGPU::SGPR_128RegClassID; 661 case 5: 662 return AMDGPU::SGPR_160RegClassID; 663 case 8: 664 return AMDGPU::SReg_256RegClassID; 665 case 16: 666 return AMDGPU::SReg_512RegClassID; 667 case 32: 668 return AMDGPU::SReg_1024RegClassID; 669 } 670 671 llvm_unreachable("invalid vector size"); 672 } 673 674 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { 675 EVT VT = N->getValueType(0); 676 unsigned NumVectorElts = VT.getVectorNumElements(); 677 EVT EltVT = VT.getVectorElementType(); 678 SDLoc DL(N); 679 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); 680 681 if (NumVectorElts == 1) { 682 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0), 683 RegClass); 684 return; 685 } 686 687 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not " 688 "supported yet"); 689 // 32 = Max Num Vector Elements 690 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) 691 // 1 = Vector Register Class 692 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); 693 694 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() == 695 Triple::amdgcn; 696 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); 697 bool IsRegSeq = true; 698 unsigned NOps = N->getNumOperands(); 699 for (unsigned i = 0; i < NOps; i++) { 700 // XXX: Why is this here? 701 if (isa<RegisterSDNode>(N->getOperand(i))) { 702 IsRegSeq = false; 703 break; 704 } 705 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i) 706 : R600RegisterInfo::getSubRegFromChannel(i); 707 RegSeqArgs[1 + (2 * i)] = N->getOperand(i); 708 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32); 709 } 710 if (NOps != NumVectorElts) { 711 // Fill in the missing undef elements if this was a scalar_to_vector. 712 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); 713 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, 714 DL, EltVT); 715 for (unsigned i = NOps; i < NumVectorElts; ++i) { 716 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i) 717 : R600RegisterInfo::getSubRegFromChannel(i); 718 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); 719 RegSeqArgs[1 + (2 * i) + 1] = 720 CurDAG->getTargetConstant(Sub, DL, MVT::i32); 721 } 722 } 723 724 if (!IsRegSeq) 725 SelectCode(N); 726 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs); 727 } 728 729 void AMDGPUDAGToDAGISel::Select(SDNode *N) { 730 unsigned int Opc = N->getOpcode(); 731 if (N->isMachineOpcode()) { 732 N->setNodeId(-1); 733 return; // Already selected. 734 } 735 736 // isa<MemSDNode> almost works but is slightly too permissive for some DS 737 // intrinsics. 
738 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) || 739 (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || 740 Opc == ISD::ATOMIC_LOAD_FADD || 741 Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || 742 Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) { 743 N = glueCopyToM0LDSInit(N); 744 SelectCode(N); 745 return; 746 } 747 748 switch (Opc) { 749 default: 750 break; 751 // We are selecting i64 ADD here instead of custom lower it during 752 // DAG legalization, so we can fold some i64 ADDs used for address 753 // calculation into the LOAD and STORE instructions. 754 case ISD::ADDC: 755 case ISD::ADDE: 756 case ISD::SUBC: 757 case ISD::SUBE: { 758 if (N->getValueType(0) != MVT::i64) 759 break; 760 761 SelectADD_SUB_I64(N); 762 return; 763 } 764 case ISD::ADDCARRY: 765 case ISD::SUBCARRY: 766 if (N->getValueType(0) != MVT::i32) 767 break; 768 769 SelectAddcSubb(N); 770 return; 771 case ISD::UADDO: 772 case ISD::USUBO: { 773 SelectUADDO_USUBO(N); 774 return; 775 } 776 case AMDGPUISD::FMUL_W_CHAIN: { 777 SelectFMUL_W_CHAIN(N); 778 return; 779 } 780 case AMDGPUISD::FMA_W_CHAIN: { 781 SelectFMA_W_CHAIN(N); 782 return; 783 } 784 785 case ISD::SCALAR_TO_VECTOR: 786 case ISD::BUILD_VECTOR: { 787 EVT VT = N->getValueType(0); 788 unsigned NumVectorElts = VT.getVectorNumElements(); 789 if (VT.getScalarSizeInBits() == 16) { 790 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) { 791 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) { 792 ReplaceNode(N, Packed); 793 return; 794 } 795 } 796 797 break; 798 } 799 800 assert(VT.getVectorElementType().bitsEq(MVT::i32)); 801 unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts); 802 SelectBuildVector(N, RegClassID); 803 return; 804 } 805 case ISD::BUILD_PAIR: { 806 SDValue RC, SubReg0, SubReg1; 807 SDLoc DL(N); 808 if (N->getValueType(0) == MVT::i128) { 809 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32); 810 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); 811 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); 812 } else if (N->getValueType(0) == MVT::i64) { 813 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32); 814 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); 815 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); 816 } else { 817 llvm_unreachable("Unhandled value type for BUILD_PAIR"); 818 } 819 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, 820 N->getOperand(1), SubReg1 }; 821 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, 822 N->getValueType(0), Ops)); 823 return; 824 } 825 826 case ISD::Constant: 827 case ISD::ConstantFP: { 828 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) 829 break; 830 831 uint64_t Imm; 832 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) 833 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); 834 else { 835 ConstantSDNode *C = cast<ConstantSDNode>(N); 836 Imm = C->getZExtValue(); 837 } 838 839 SDLoc DL(N); 840 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0))); 841 return; 842 } 843 case AMDGPUISD::BFE_I32: 844 case AMDGPUISD::BFE_U32: { 845 // There is a scalar version available, but unlike the vector version which 846 // has a separate operand for the offset and width, the scalar version packs 847 // the width and offset into a single operand. Try to move to the scalar 848 // version if the offsets are constant, so that we can try to keep extended 849 // loads of kernel arguments in SGPRs. 
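    // For instance, extracting 8 bits starting at bit 16 becomes the single
    // packed S_BFE operand (8 << 16) | 16.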
850 851 // TODO: Technically we could try to pattern match scalar bitshifts of 852 // dynamic values, but it's probably not useful. 853 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); 854 if (!Offset) 855 break; 856 857 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); 858 if (!Width) 859 break; 860 861 bool Signed = Opc == AMDGPUISD::BFE_I32; 862 863 uint32_t OffsetVal = Offset->getZExtValue(); 864 uint32_t WidthVal = Width->getZExtValue(); 865 866 ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, 867 SDLoc(N), N->getOperand(0), OffsetVal, WidthVal)); 868 return; 869 } 870 case AMDGPUISD::DIV_SCALE: { 871 SelectDIV_SCALE(N); 872 return; 873 } 874 case AMDGPUISD::DIV_FMAS: { 875 SelectDIV_FMAS(N); 876 return; 877 } 878 case AMDGPUISD::MAD_I64_I32: 879 case AMDGPUISD::MAD_U64_U32: { 880 SelectMAD_64_32(N); 881 return; 882 } 883 case ISD::CopyToReg: { 884 const SITargetLowering& Lowering = 885 *static_cast<const SITargetLowering*>(getTargetLowering()); 886 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG); 887 break; 888 } 889 case ISD::AND: 890 case ISD::SRL: 891 case ISD::SRA: 892 case ISD::SIGN_EXTEND_INREG: 893 if (N->getValueType(0) != MVT::i32) 894 break; 895 896 SelectS_BFE(N); 897 return; 898 case ISD::BRCOND: 899 SelectBRCOND(N); 900 return; 901 case ISD::FMAD: 902 case ISD::FMA: 903 SelectFMAD_FMA(N); 904 return; 905 case AMDGPUISD::ATOMIC_CMP_SWAP: 906 SelectATOMIC_CMP_SWAP(N); 907 return; 908 case AMDGPUISD::CVT_PKRTZ_F16_F32: 909 case AMDGPUISD::CVT_PKNORM_I16_F32: 910 case AMDGPUISD::CVT_PKNORM_U16_F32: 911 case AMDGPUISD::CVT_PK_U16_U32: 912 case AMDGPUISD::CVT_PK_I16_I32: { 913 // Hack around using a legal type if f16 is illegal. 914 if (N->getValueType(0) == MVT::i32) { 915 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? 
MVT::v2f16 : MVT::v2i16; 916 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT), 917 { N->getOperand(0), N->getOperand(1) }); 918 SelectCode(N); 919 return; 920 } 921 922 break; 923 } 924 case ISD::INTRINSIC_W_CHAIN: { 925 SelectINTRINSIC_W_CHAIN(N); 926 return; 927 } 928 case ISD::INTRINSIC_WO_CHAIN: { 929 SelectINTRINSIC_WO_CHAIN(N); 930 return; 931 } 932 case ISD::INTRINSIC_VOID: { 933 SelectINTRINSIC_VOID(N); 934 return; 935 } 936 } 937 938 SelectCode(N); 939 } 940 941 bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { 942 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); 943 const Instruction *Term = BB->getTerminator(); 944 return Term->getMetadata("amdgpu.uniform") || 945 Term->getMetadata("structurizecfg.uniform"); 946 } 947 948 StringRef AMDGPUDAGToDAGISel::getPassName() const { 949 return "AMDGPU DAG->DAG Pattern Instruction Selection"; 950 } 951 952 //===----------------------------------------------------------------------===// 953 // Complex Patterns 954 //===----------------------------------------------------------------------===// 955 956 bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, 957 SDValue &Offset) { 958 return false; 959 } 960 961 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, 962 SDValue &Offset) { 963 ConstantSDNode *C; 964 SDLoc DL(Addr); 965 966 if ((C = dyn_cast<ConstantSDNode>(Addr))) { 967 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); 968 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 969 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && 970 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { 971 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); 972 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 973 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && 974 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { 975 Base = Addr.getOperand(0); 976 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 977 } else { 978 Base = Addr; 979 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); 980 } 981 982 return true; 983 } 984 985 SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val, 986 const SDLoc &DL) const { 987 SDNode *Mov = CurDAG->getMachineNode( 988 AMDGPU::S_MOV_B32, DL, MVT::i32, 989 CurDAG->getTargetConstant(Val, DL, MVT::i32)); 990 return SDValue(Mov, 0); 991 } 992 993 // FIXME: Should only handle addcarry/subcarry 994 void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { 995 SDLoc DL(N); 996 SDValue LHS = N->getOperand(0); 997 SDValue RHS = N->getOperand(1); 998 999 unsigned Opcode = N->getOpcode(); 1000 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE); 1001 bool ProduceCarry = 1002 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC; 1003 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE; 1004 1005 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); 1006 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); 1007 1008 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, 1009 DL, MVT::i32, LHS, Sub0); 1010 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, 1011 DL, MVT::i32, LHS, Sub1); 1012 1013 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, 1014 DL, MVT::i32, RHS, Sub0); 1015 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, 1016 DL, MVT::i32, RHS, Sub1); 1017 1018 SDVTList VTList = 
CurDAG->getVTList(MVT::i32, MVT::Glue); 1019 1020 static const unsigned OpcMap[2][2][2] = { 1021 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32}, 1022 {AMDGPU::V_SUB_I32_e32, AMDGPU::V_ADD_I32_e32}}, 1023 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32}, 1024 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}}; 1025 1026 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd]; 1027 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd]; 1028 1029 SDNode *AddLo; 1030 if (!ConsumeCarry) { 1031 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; 1032 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args); 1033 } else { 1034 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) }; 1035 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args); 1036 } 1037 SDValue AddHiArgs[] = { 1038 SDValue(Hi0, 0), 1039 SDValue(Hi1, 0), 1040 SDValue(AddLo, 1) 1041 }; 1042 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs); 1043 1044 SDValue RegSequenceArgs[] = { 1045 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), 1046 SDValue(AddLo,0), 1047 Sub0, 1048 SDValue(AddHi,0), 1049 Sub1, 1050 }; 1051 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, 1052 MVT::i64, RegSequenceArgs); 1053 1054 if (ProduceCarry) { 1055 // Replace the carry-use 1056 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1)); 1057 } 1058 1059 // Replace the remaining uses. 1060 ReplaceNode(N, RegSequence); 1061 } 1062 1063 void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) { 1064 SDLoc DL(N); 1065 SDValue LHS = N->getOperand(0); 1066 SDValue RHS = N->getOperand(1); 1067 SDValue CI = N->getOperand(2); 1068 1069 unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64 1070 : AMDGPU::V_SUBB_U32_e64; 1071 CurDAG->SelectNodeTo( 1072 N, Opc, N->getVTList(), 1073 {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); 1074 } 1075 1076 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { 1077 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned 1078 // carry out despite the _i32 name. These were renamed in VI to _U32. 1079 // FIXME: We should probably rename the opcodes here. 1080 unsigned Opc = N->getOpcode() == ISD::UADDO ? 1081 AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; 1082 1083 CurDAG->SelectNodeTo( 1084 N, Opc, N->getVTList(), 1085 {N->getOperand(0), N->getOperand(1), 1086 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); 1087 } 1088 1089 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) { 1090 SDLoc SL(N); 1091 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod 1092 SDValue Ops[10]; 1093 1094 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]); 1095 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]); 1096 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]); 1097 Ops[8] = N->getOperand(0); 1098 Ops[9] = N->getOperand(4); 1099 1100 CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops); 1101 } 1102 1103 void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) { 1104 SDLoc SL(N); 1105 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod 1106 SDValue Ops[8]; 1107 1108 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]); 1109 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]); 1110 Ops[6] = N->getOperand(0); 1111 Ops[7] = N->getOperand(3); 1112 1113 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops); 1114 } 1115 1116 // We need to handle this here because tablegen doesn't support matching 1117 // instructions with multiple outputs. 
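// DIV_SCALE produces two results (the scaled operand and a condition bit used
// later in the division sequence), so it cannot be a plain tablegen pattern.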
1118 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { 1119 SDLoc SL(N); 1120 EVT VT = N->getValueType(0); 1121 1122 assert(VT == MVT::f32 || VT == MVT::f64); 1123 1124 unsigned Opc 1125 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; 1126 1127 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; 1128 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 1129 } 1130 1131 void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) { 1132 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget); 1133 const SIRegisterInfo *TRI = ST->getRegisterInfo(); 1134 1135 SDLoc SL(N); 1136 EVT VT = N->getValueType(0); 1137 1138 assert(VT == MVT::f32 || VT == MVT::f64); 1139 1140 unsigned Opc 1141 = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32; 1142 1143 SDValue CarryIn = N->getOperand(3); 1144 // V_DIV_FMAS implicitly reads VCC. 1145 SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL, 1146 TRI->getVCC(), CarryIn, SDValue()); 1147 1148 SDValue Ops[10]; 1149 1150 SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); 1151 SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); 1152 SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); 1153 1154 Ops[8] = VCC; 1155 Ops[9] = VCC.getValue(1); 1156 1157 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 1158 } 1159 1160 // We need to handle this here because tablegen doesn't support matching 1161 // instructions with multiple outputs. 1162 void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { 1163 SDLoc SL(N); 1164 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; 1165 unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32; 1166 1167 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); 1168 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), 1169 Clamp }; 1170 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 1171 } 1172 1173 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset, 1174 unsigned OffsetBits) const { 1175 if ((OffsetBits == 16 && !isUInt<16>(Offset)) || 1176 (OffsetBits == 8 && !isUInt<8>(Offset))) 1177 return false; 1178 1179 if (Subtarget->hasUsableDSOffset() || 1180 Subtarget->unsafeDSOffsetFoldingEnabled()) 1181 return true; 1182 1183 // On Southern Islands instruction with a negative base value and an offset 1184 // don't seem to work. 1185 return CurDAG->SignBitIsZero(Base); 1186 } 1187 1188 bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, 1189 SDValue &Offset) const { 1190 SDLoc DL(Addr); 1191 if (CurDAG->isBaseWithConstantOffset(Addr)) { 1192 SDValue N0 = Addr.getOperand(0); 1193 SDValue N1 = Addr.getOperand(1); 1194 ConstantSDNode *C1 = cast<ConstantSDNode>(N1); 1195 if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { 1196 // (add n0, c0) 1197 Base = N0; 1198 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); 1199 return true; 1200 } 1201 } else if (Addr.getOpcode() == ISD::SUB) { 1202 // sub C, x -> add (sub 0, x), C 1203 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { 1204 int64_t ByteOffset = C->getSExtValue(); 1205 if (isUInt<16>(ByteOffset)) { 1206 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); 1207 1208 // XXX - This is kind of hacky. Create a dummy sub node so we can check 1209 // the known bits in isDSOffsetLegal. We need to emit the selected node 1210 // here, so this is thrown away. 
1211 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, 1212 Zero, Addr.getOperand(1)); 1213 1214 if (isDSOffsetLegal(Sub, ByteOffset, 16)) { 1215 SmallVector<SDValue, 3> Opnds; 1216 Opnds.push_back(Zero); 1217 Opnds.push_back(Addr.getOperand(1)); 1218 1219 // FIXME: Select to VOP3 version for with-carry. 1220 unsigned SubOp = AMDGPU::V_SUB_I32_e32; 1221 if (Subtarget->hasAddNoCarry()) { 1222 SubOp = AMDGPU::V_SUB_U32_e64; 1223 Opnds.push_back( 1224 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit 1225 } 1226 1227 MachineSDNode *MachineSub = 1228 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds); 1229 1230 Base = SDValue(MachineSub, 0); 1231 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16); 1232 return true; 1233 } 1234 } 1235 } 1236 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { 1237 // If we have a constant address, prefer to put the constant into the 1238 // offset. This can save moves to load the constant address since multiple 1239 // operations can share the zero base address register, and enables merging 1240 // into read2 / write2 instructions. 1241 1242 SDLoc DL(Addr); 1243 1244 if (isUInt<16>(CAddr->getZExtValue())) { 1245 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); 1246 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, 1247 DL, MVT::i32, Zero); 1248 Base = SDValue(MovZero, 0); 1249 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); 1250 return true; 1251 } 1252 } 1253 1254 // default case 1255 Base = Addr; 1256 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16); 1257 return true; 1258 } 1259 1260 // TODO: If offset is too big, put low 16-bit into offset. 1261 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, 1262 SDValue &Offset0, 1263 SDValue &Offset1) const { 1264 SDLoc DL(Addr); 1265 1266 if (CurDAG->isBaseWithConstantOffset(Addr)) { 1267 SDValue N0 = Addr.getOperand(0); 1268 SDValue N1 = Addr.getOperand(1); 1269 ConstantSDNode *C1 = cast<ConstantSDNode>(N1); 1270 unsigned DWordOffset0 = C1->getZExtValue() / 4; 1271 unsigned DWordOffset1 = DWordOffset0 + 1; 1272 // (add n0, c0) 1273 if (isDSOffsetLegal(N0, DWordOffset1, 8)) { 1274 Base = N0; 1275 Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); 1276 Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); 1277 return true; 1278 } 1279 } else if (Addr.getOpcode() == ISD::SUB) { 1280 // sub C, x -> add (sub 0, x), C 1281 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { 1282 unsigned DWordOffset0 = C->getZExtValue() / 4; 1283 unsigned DWordOffset1 = DWordOffset0 + 1; 1284 1285 if (isUInt<8>(DWordOffset0)) { 1286 SDLoc DL(Addr); 1287 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); 1288 1289 // XXX - This is kind of hacky. Create a dummy sub node so we can check 1290 // the known bits in isDSOffsetLegal. We need to emit the selected node 1291 // here, so this is thrown away. 
1292 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, 1293 Zero, Addr.getOperand(1)); 1294 1295 if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { 1296 SmallVector<SDValue, 3> Opnds; 1297 Opnds.push_back(Zero); 1298 Opnds.push_back(Addr.getOperand(1)); 1299 unsigned SubOp = AMDGPU::V_SUB_I32_e32; 1300 if (Subtarget->hasAddNoCarry()) { 1301 SubOp = AMDGPU::V_SUB_U32_e64; 1302 Opnds.push_back( 1303 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit 1304 } 1305 1306 MachineSDNode *MachineSub 1307 = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds); 1308 1309 Base = SDValue(MachineSub, 0); 1310 Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); 1311 Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); 1312 return true; 1313 } 1314 } 1315 } 1316 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { 1317 unsigned DWordOffset0 = CAddr->getZExtValue() / 4; 1318 unsigned DWordOffset1 = DWordOffset0 + 1; 1319 assert(4 * DWordOffset0 == CAddr->getZExtValue()); 1320 1321 if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { 1322 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); 1323 MachineSDNode *MovZero 1324 = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, 1325 DL, MVT::i32, Zero); 1326 Base = SDValue(MovZero, 0); 1327 Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); 1328 Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); 1329 return true; 1330 } 1331 } 1332 1333 // default case 1334 1335 Base = Addr; 1336 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); 1337 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); 1338 return true; 1339 } 1340 1341 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, 1342 SDValue &VAddr, SDValue &SOffset, 1343 SDValue &Offset, SDValue &Offen, 1344 SDValue &Idxen, SDValue &Addr64, 1345 SDValue &GLC, SDValue &SLC, 1346 SDValue &TFE, SDValue &DLC, 1347 SDValue &SWZ) const { 1348 // Subtarget prefers to use flat instruction 1349 // FIXME: This should be a pattern predicate and not reach here 1350 if (Subtarget->useFlatForGlobal()) 1351 return false; 1352 1353 SDLoc DL(Addr); 1354 1355 if (!GLC.getNode()) 1356 GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); 1357 if (!SLC.getNode()) 1358 SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); 1359 TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); 1360 DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); 1361 SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1); 1362 1363 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); 1364 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); 1365 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); 1366 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); 1367 1368 ConstantSDNode *C1 = nullptr; 1369 SDValue N0 = Addr; 1370 if (CurDAG->isBaseWithConstantOffset(Addr)) { 1371 C1 = cast<ConstantSDNode>(Addr.getOperand(1)); 1372 if (isUInt<32>(C1->getZExtValue())) 1373 N0 = Addr.getOperand(0); 1374 else 1375 C1 = nullptr; 1376 } 1377 1378 if (N0.getOpcode() == ISD::ADD) { 1379 // (add N2, N3) -> addr64, or 1380 // (add (add N2, N3), C1) -> addr64 1381 SDValue N2 = N0.getOperand(0); 1382 SDValue N3 = N0.getOperand(1); 1383 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); 1384 1385 if (N2->isDivergent()) { 1386 if (N3->isDivergent()) { 1387 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the 1388 // addr64, and construct the resource from a 0 address. 1389 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0); 1390 VAddr = N0; 1391 } else { 1392 // N2 is divergent, N3 is not. 
1393 Ptr = N3; 1394 VAddr = N2; 1395 } 1396 } else { 1397 // N2 is not divergent. 1398 Ptr = N2; 1399 VAddr = N3; 1400 } 1401 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); 1402 } else if (N0->isDivergent()) { 1403 // N0 is divergent. Use it as the addr64, and construct the resource from a 1404 // 0 address. 1405 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0); 1406 VAddr = N0; 1407 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); 1408 } else { 1409 // N0 -> offset, or 1410 // (N0 + C1) -> offset 1411 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); 1412 Ptr = N0; 1413 } 1414 1415 if (!C1) { 1416 // No offset. 1417 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); 1418 return true; 1419 } 1420 1421 if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) { 1422 // Legal offset for instruction. 1423 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); 1424 return true; 1425 } 1426 1427 // Illegal offset, store it in soffset. 1428 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); 1429 SOffset = 1430 SDValue(CurDAG->getMachineNode( 1431 AMDGPU::S_MOV_B32, DL, MVT::i32, 1432 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), 1433 0); 1434 return true; 1435 } 1436 1437 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, 1438 SDValue &VAddr, SDValue &SOffset, 1439 SDValue &Offset, SDValue &GLC, 1440 SDValue &SLC, SDValue &TFE, 1441 SDValue &DLC, SDValue &SWZ) const { 1442 SDValue Ptr, Offen, Idxen, Addr64; 1443 1444 // addr64 bit was removed for volcanic islands. 1445 // FIXME: This should be a pattern predicate and not reach here 1446 if (!Subtarget->hasAddr64()) 1447 return false; 1448 1449 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, 1450 GLC, SLC, TFE, DLC, SWZ)) 1451 return false; 1452 1453 ConstantSDNode *C = cast<ConstantSDNode>(Addr64); 1454 if (C->getSExtValue()) { 1455 SDLoc DL(Addr); 1456 1457 const SITargetLowering& Lowering = 1458 *static_cast<const SITargetLowering*>(getTargetLowering()); 1459 1460 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0); 1461 return true; 1462 } 1463 1464 return false; 1465 } 1466 1467 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, 1468 SDValue &VAddr, SDValue &SOffset, 1469 SDValue &Offset, 1470 SDValue &SLC) const { 1471 SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); 1472 SDValue GLC, TFE, DLC, SWZ; 1473 1474 return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ); 1475 } 1476 1477 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 1478 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 1479 return PSV && PSV->isStack(); 1480 } 1481 1482 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { 1483 SDLoc DL(N); 1484 const MachineFunction &MF = CurDAG->getMachineFunction(); 1485 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1486 1487 if (auto FI = dyn_cast<FrameIndexSDNode>(N)) { 1488 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), 1489 FI->getValueType(0)); 1490 1491 // If we can resolve this to a frame index access, this will be relative to 1492 // either the stack or frame pointer SGPR. 1493 return std::make_pair( 1494 TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)); 1495 } 1496 1497 // If we don't know this private access is a local stack object, it needs to 1498 // be relative to the entry point's scratch wave offset. 
1499 return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32)); 1500 } 1501 1502 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, 1503 SDValue Addr, SDValue &Rsrc, 1504 SDValue &VAddr, SDValue &SOffset, 1505 SDValue &ImmOffset) const { 1506 1507 SDLoc DL(Addr); 1508 MachineFunction &MF = CurDAG->getMachineFunction(); 1509 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1510 1511 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); 1512 1513 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { 1514 unsigned Imm = CAddr->getZExtValue(); 1515 1516 SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); 1517 MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, 1518 DL, MVT::i32, HighBits); 1519 VAddr = SDValue(MovHighBits, 0); 1520 1521 // In a call sequence, stores to the argument stack area are relative to the 1522 // stack pointer. 1523 const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); 1524 1525 SOffset = isStackPtrRelative(PtrInfo) 1526 ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) 1527 : CurDAG->getTargetConstant(0, DL, MVT::i32); 1528 ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); 1529 return true; 1530 } 1531 1532 if (CurDAG->isBaseWithConstantOffset(Addr)) { 1533 // (add n0, c1) 1534 1535 SDValue N0 = Addr.getOperand(0); 1536 SDValue N1 = Addr.getOperand(1); 1537 1538 // Offsets in vaddr must be positive if range checking is enabled. 1539 // 1540 // The total computation of vaddr + soffset + offset must not overflow. If 1541 // vaddr is negative, even if offset is 0 the sgpr offset add will end up 1542 // overflowing. 1543 // 1544 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would 1545 // always perform a range check. If a negative vaddr base index was used, 1546 // this would fail the range check. The overall address computation would 1547 // compute a valid address, but this doesn't happen due to the range 1548 // check. For out-of-bounds MUBUF loads, a 0 is returned. 1549 // 1550 // Therefore it should be safe to fold any VGPR offset on gfx9 into the 1551 // MUBUF vaddr, but not on older subtargets which can only do this if the 1552 // sign bit is known 0. 1553 ConstantSDNode *C1 = cast<ConstantSDNode>(N1); 1554 if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) && 1555 (!Subtarget->privateMemoryResourceIsRangeChecked() || 1556 CurDAG->SignBitIsZero(N0))) { 1557 std::tie(VAddr, SOffset) = foldFrameIndex(N0); 1558 ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); 1559 return true; 1560 } 1561 } 1562 1563 // (node) 1564 std::tie(VAddr, SOffset) = foldFrameIndex(Addr); 1565 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); 1566 return true; 1567 } 1568 1569 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, 1570 SDValue Addr, 1571 SDValue &SRsrc, 1572 SDValue &SOffset, 1573 SDValue &Offset) const { 1574 ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr); 1575 if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) 1576 return false; 1577 1578 SDLoc DL(Addr); 1579 MachineFunction &MF = CurDAG->getMachineFunction(); 1580 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1581 1582 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); 1583 1584 const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); 1585 1586 // FIXME: Get from MachinePointerInfo? 
We should only be using the frame 1587 // offset if we know this is in a call sequence. 1588 SOffset = isStackPtrRelative(PtrInfo) 1589 ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) 1590 : CurDAG->getTargetConstant(0, DL, MVT::i32); 1591 1592 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); 1593 return true; 1594 } 1595 1596 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, 1597 SDValue &SOffset, SDValue &Offset, 1598 SDValue &GLC, SDValue &SLC, 1599 SDValue &TFE, SDValue &DLC, 1600 SDValue &SWZ) const { 1601 SDValue Ptr, VAddr, Offen, Idxen, Addr64; 1602 const SIInstrInfo *TII = 1603 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 1604 1605 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, 1606 GLC, SLC, TFE, DLC, SWZ)) 1607 return false; 1608 1609 if (!cast<ConstantSDNode>(Offen)->getSExtValue() && 1610 !cast<ConstantSDNode>(Idxen)->getSExtValue() && 1611 !cast<ConstantSDNode>(Addr64)->getSExtValue()) { 1612 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | 1613 APInt::getAllOnesValue(32).getZExtValue(); // Size 1614 SDLoc DL(Addr); 1615 1616 const SITargetLowering& Lowering = 1617 *static_cast<const SITargetLowering*>(getTargetLowering()); 1618 1619 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); 1620 return true; 1621 } 1622 return false; 1623 } 1624 1625 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, 1626 SDValue &Soffset, SDValue &Offset 1627 ) const { 1628 SDValue GLC, SLC, TFE, DLC, SWZ; 1629 1630 return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); 1631 } 1632 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, 1633 SDValue &Soffset, SDValue &Offset, 1634 SDValue &SLC) const { 1635 SDValue GLC, TFE, DLC, SWZ; 1636 1637 return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); 1638 } 1639 1640 // Find a load or store from corresponding pattern root. 1641 // Roots may be build_vector, bitconvert or their combinations. 1642 static MemSDNode* findMemSDNode(SDNode *N) { 1643 N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode(); 1644 if (MemSDNode *MN = dyn_cast<MemSDNode>(N)) 1645 return MN; 1646 assert(isa<BuildVectorSDNode>(N)); 1647 for (SDValue V : N->op_values()) 1648 if (MemSDNode *MN = 1649 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V))) 1650 return MN; 1651 llvm_unreachable("cannot find MemSDNode in the pattern!"); 1652 } 1653 1654 template <bool IsSigned> 1655 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, 1656 SDValue Addr, 1657 SDValue &VAddr, 1658 SDValue &Offset, 1659 SDValue &SLC) const { 1660 int64_t OffsetVal = 0; 1661 1662 if (Subtarget->hasFlatInstOffsets() && 1663 (!Subtarget->hasFlatSegmentOffsetBug() || 1664 findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) && 1665 CurDAG->isBaseWithConstantOffset(Addr)) { 1666 SDValue N0 = Addr.getOperand(0); 1667 SDValue N1 = Addr.getOperand(1); 1668 uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); 1669 1670 const SIInstrInfo *TII = Subtarget->getInstrInfo(); 1671 unsigned AS = findMemSDNode(N)->getAddressSpace(); 1672 if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) { 1673 Addr = N0; 1674 OffsetVal = COffsetVal; 1675 } else { 1676 // If the offset doesn't fit, put the low bits into the offset field and 1677 // add the rest. 
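      // For example, with a 12-bit unsigned immediate field, an offset of
      // 0x1234 is split into ImmField = 0x234 and RemainderOffset = 0x1000,
      // and the remainder is added to the address below.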
1678 1679 SDLoc DL(N); 1680 uint64_t ImmField; 1681 const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned); 1682 if (IsSigned) { 1683 ImmField = SignExtend64(COffsetVal, NumBits); 1684 1685 // Don't use a negative offset field if the base offset is positive. 1686 // Since the scheduler currently relies on the offset field, doing so 1687 // could result in strange scheduling decisions. 1688 1689 // TODO: Should we not do this in the opposite direction as well? 1690 if (static_cast<int64_t>(COffsetVal) > 0) { 1691 if (static_cast<int64_t>(ImmField) < 0) { 1692 const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1); 1693 ImmField = COffsetVal & OffsetMask; 1694 } 1695 } 1696 } else { 1697 // TODO: Should we do this for a negative offset? 1698 const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits); 1699 ImmField = COffsetVal & OffsetMask; 1700 } 1701 1702 uint64_t RemainderOffset = COffsetVal - ImmField; 1703 1704 assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned)); 1705 assert(RemainderOffset + ImmField == COffsetVal); 1706 1707 OffsetVal = ImmField; 1708 1709 // TODO: Should this try to use a scalar add pseudo if the base address is 1710 // uniform and saddr is usable? 1711 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); 1712 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); 1713 1714 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, 1715 DL, MVT::i32, N0, Sub0); 1716 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, 1717 DL, MVT::i32, N0, Sub1); 1718 1719 SDValue AddOffsetLo 1720 = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); 1721 SDValue AddOffsetHi 1722 = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); 1723 1724 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); 1725 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); 1726 1727 SDNode *Add = CurDAG->getMachineNode( 1728 AMDGPU::V_ADD_I32_e64, DL, VTs, 1729 {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); 1730 1731 SDNode *Addc = CurDAG->getMachineNode( 1732 AMDGPU::V_ADDC_U32_e64, DL, VTs, 1733 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); 1734 1735 SDValue RegSequenceArgs[] = { 1736 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), 1737 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1 1738 }; 1739 1740 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, 1741 MVT::i64, RegSequenceArgs), 0); 1742 } 1743 } 1744 1745 VAddr = Addr; 1746 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16); 1747 SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); 1748 return true; 1749 } 1750 1751 bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N, 1752 SDValue Addr, 1753 SDValue &VAddr, 1754 SDValue &Offset, 1755 SDValue &SLC) const { 1756 return SelectFlatOffset<false>(N, Addr, VAddr, Offset, SLC); 1757 } 1758 1759 bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N, 1760 SDValue Addr, 1761 SDValue &VAddr, 1762 SDValue &Offset, 1763 SDValue &SLC) const { 1764 return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC); 1765 } 1766 1767 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, 1768 SDValue &Offset, bool &Imm) const { 1769 1770 // FIXME: Handle non-constant offsets. 
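// The code below tries, in order: (1) an offset that encodes directly in the
// SMRD/SMEM immediate field, (2) on SEA_ISLANDS only, the 32-bit literal
// offset encoding, and (3) as a last resort, materializing the byte offset
// into an SGPR with S_MOV_B32 so it can be used as a register offset. Only
// case (1) reports an immediate by setting Imm.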
1771 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); 1772 if (!C) 1773 return false; 1774 1775 SDLoc SL(ByteOffsetNode); 1776 GCNSubtarget::Generation Gen = Subtarget->getGeneration(); 1777 uint64_t ByteOffset = C->getZExtValue(); 1778 Optional<int64_t> EncodedOffset = 1779 AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset); 1780 if (EncodedOffset) { 1781 Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); 1782 Imm = true; 1783 return true; 1784 } 1785 1786 if (Gen == AMDGPUSubtarget::SEA_ISLANDS) { 1787 EncodedOffset = 1788 AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset); 1789 if (EncodedOffset) { 1790 Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); 1791 return true; 1792 } 1793 } 1794 1795 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset)) 1796 return false; 1797 1798 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); 1799 Offset = SDValue( 1800 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0); 1801 1802 return true; 1803 } 1804 1805 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { 1806 if (Addr.getValueType() != MVT::i32) 1807 return Addr; 1808 1809 // Zero-extend a 32-bit address. 1810 SDLoc SL(Addr); 1811 1812 const MachineFunction &MF = CurDAG->getMachineFunction(); 1813 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1814 unsigned AddrHiVal = Info->get32BitAddressHighBits(); 1815 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32); 1816 1817 const SDValue Ops[] = { 1818 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32), 1819 Addr, 1820 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), 1821 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi), 1822 0), 1823 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32), 1824 }; 1825 1826 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64, 1827 Ops), 0); 1828 } 1829 1830 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, 1831 SDValue &Offset, bool &Imm) const { 1832 SDLoc SL(Addr); 1833 1834 // A 32-bit (address + offset) should not cause unsigned 32-bit integer 1835 // wraparound, because s_load instructions perform the addition in 64 bits. 
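// For illustration (hypothetical values): with a 32-bit base of 0xfffffff0
// and a constant offset of 0x20, a 32-bit add wraps to 0x10, while the
// 64-bit addition done by s_load yields 0x100000010, a different address.
// Hence the base/offset split below is only performed for 32-bit addresses
// when the add is known not to wrap (nuw).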
1836 if ((Addr.getValueType() != MVT::i32 || 1837 Addr->getFlags().hasNoUnsignedWrap()) && 1838 CurDAG->isBaseWithConstantOffset(Addr)) { 1839 SDValue N0 = Addr.getOperand(0); 1840 SDValue N1 = Addr.getOperand(1); 1841 1842 if (SelectSMRDOffset(N1, Offset, Imm)) { 1843 SBase = Expand32BitAddress(N0); 1844 return true; 1845 } 1846 } 1847 SBase = Expand32BitAddress(Addr); 1848 Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); 1849 Imm = true; 1850 return true; 1851 } 1852 1853 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, 1854 SDValue &Offset) const { 1855 bool Imm = false; 1856 return SelectSMRD(Addr, SBase, Offset, Imm) && Imm; 1857 } 1858 1859 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, 1860 SDValue &Offset) const { 1861 1862 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); 1863 1864 bool Imm = false; 1865 if (!SelectSMRD(Addr, SBase, Offset, Imm)) 1866 return false; 1867 1868 return !Imm && isa<ConstantSDNode>(Offset); 1869 } 1870 1871 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, 1872 SDValue &Offset) const { 1873 bool Imm = false; 1874 return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm && 1875 !isa<ConstantSDNode>(Offset); 1876 } 1877 1878 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, 1879 SDValue &Offset) const { 1880 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) { 1881 if (auto Imm = AMDGPU::getSMRDEncodedOffset(*Subtarget, 1882 C->getZExtValue())) { 1883 Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32); 1884 return true; 1885 } 1886 } 1887 1888 return false; 1889 } 1890 1891 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr, 1892 SDValue &Offset) const { 1893 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); 1894 1895 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) { 1896 if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, 1897 C->getZExtValue())) { 1898 Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32); 1899 return true; 1900 } 1901 } 1902 1903 return false; 1904 } 1905 1906 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, 1907 SDValue &Base, 1908 SDValue &Offset) const { 1909 SDLoc DL(Index); 1910 1911 if (CurDAG->isBaseWithConstantOffset(Index)) { 1912 SDValue N0 = Index.getOperand(0); 1913 SDValue N1 = Index.getOperand(1); 1914 ConstantSDNode *C1 = cast<ConstantSDNode>(N1); 1915 1916 // (add n0, c0) 1917 // Don't peel off the offset (c0) if doing so could possibly lead 1918 // the base (n0) to be negative. 1919 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) { 1920 Base = N0; 1921 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); 1922 return true; 1923 } 1924 } 1925 1926 if (isa<ConstantSDNode>(Index)) 1927 return false; 1928 1929 Base = Index; 1930 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); 1931 return true; 1932 } 1933 1934 SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL, 1935 SDValue Val, uint32_t Offset, 1936 uint32_t Width) { 1937 // Transformation function, pack the offset and width of a BFE into 1938 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second 1939 // source, bits [5:0] contain the offset and bits [22:16] the width. 
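// For example, Offset = 8 and Width = 16 pack to (8 | (16 << 16)) ==
// 0x00100008.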
1940 uint32_t PackedVal = Offset | (Width << 16); 1941 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32); 1942 1943 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); 1944 } 1945 1946 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { 1947 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) 1948 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) 1949 // Predicate: 0 < b <= c < 32 1950 1951 const SDValue &Shl = N->getOperand(0); 1952 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1)); 1953 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 1954 1955 if (B && C) { 1956 uint32_t BVal = B->getZExtValue(); 1957 uint32_t CVal = C->getZExtValue(); 1958 1959 if (0 < BVal && BVal <= CVal && CVal < 32) { 1960 bool Signed = N->getOpcode() == ISD::SRA; 1961 unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1962 1963 ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal, 1964 32 - CVal)); 1965 return; 1966 } 1967 } 1968 SelectCode(N); 1969 } 1970 1971 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { 1972 switch (N->getOpcode()) { 1973 case ISD::AND: 1974 if (N->getOperand(0).getOpcode() == ISD::SRL) { 1975 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" 1976 // Predicate: isMask(mask) 1977 const SDValue &Srl = N->getOperand(0); 1978 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1)); 1979 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1)); 1980 1981 if (Shift && Mask) { 1982 uint32_t ShiftVal = Shift->getZExtValue(); 1983 uint32_t MaskVal = Mask->getZExtValue(); 1984 1985 if (isMask_32(MaskVal)) { 1986 uint32_t WidthVal = countPopulation(MaskVal); 1987 1988 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), 1989 Srl.getOperand(0), ShiftVal, WidthVal)); 1990 return; 1991 } 1992 } 1993 } 1994 break; 1995 case ISD::SRL: 1996 if (N->getOperand(0).getOpcode() == ISD::AND) { 1997 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" 1998 // Predicate: isMask(mask >> b) 1999 const SDValue &And = N->getOperand(0); 2000 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1)); 2001 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1)); 2002 2003 if (Shift && Mask) { 2004 uint32_t ShiftVal = Shift->getZExtValue(); 2005 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; 2006 2007 if (isMask_32(MaskVal)) { 2008 uint32_t WidthVal = countPopulation(MaskVal); 2009 2010 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), 2011 And.getOperand(0), ShiftVal, WidthVal)); 2012 return; 2013 } 2014 } 2015 } else if (N->getOperand(0).getOpcode() == ISD::SHL) { 2016 SelectS_BFEFromShifts(N); 2017 return; 2018 } 2019 break; 2020 case ISD::SRA: 2021 if (N->getOperand(0).getOpcode() == ISD::SHL) { 2022 SelectS_BFEFromShifts(N); 2023 return; 2024 } 2025 break; 2026 2027 case ISD::SIGN_EXTEND_INREG: { 2028 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8 2029 SDValue Src = N->getOperand(0); 2030 if (Src.getOpcode() != ISD::SRL) 2031 break; 2032 2033 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1)); 2034 if (!Amt) 2035 break; 2036 2037 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits(); 2038 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0), 2039 Amt->getZExtValue(), Width)); 2040 return; 2041 } 2042 } 2043 2044 SelectCode(N); 2045 } 2046 2047 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const { 2048 assert(N->getOpcode() == ISD::BRCOND); 2049 if 
(!N->hasOneUse())
2050 return false;
2051
2052 SDValue Cond = N->getOperand(1);
2053 if (Cond.getOpcode() == ISD::CopyToReg)
2054 Cond = Cond.getOperand(2);
2055
2056 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2057 return false;
2058
2059 MVT VT = Cond.getOperand(0).getSimpleValueType();
2060 if (VT == MVT::i32)
2061 return true;
2062
2063 if (VT == MVT::i64) {
2064 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2065
2066 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2067 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2068 }
2069
2070 return false;
2071 }
2072
2073 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2074 SDValue Cond = N->getOperand(1);
2075
2076 if (Cond.isUndef()) {
2077 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2078 N->getOperand(2), N->getOperand(0));
2079 return;
2080 }
2081
2082 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2083 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2084
2085 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2086 unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
2087 unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC();
2088 SDLoc SL(N);
2089
2090 if (!UseSCCBr) {
2091 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2092 // analyzed what generates the vcc value, so we do not know whether vcc
2093 // bits for disabled lanes are 0. Thus we need to mask out bits for
2094 // disabled lanes.
2095 //
2096 // In the case that we select S_CBRANCH_SCC1 and it gets
2097 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2098 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2099 //
2100 // We could add an analysis of what generates the vcc value here and omit
2101 // the S_AND when it is unnecessary. But it would be better to add a separate
2102 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2103 // catches both cases.
2104 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2105 : AMDGPU::S_AND_B64,
2106 SL, MVT::i1,
2107 CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2108 : AMDGPU::EXEC,
2109 MVT::i1),
2110 Cond),
2111 0);
2112 }
2113
2114 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2115 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2116 N->getOperand(2), // Basic Block
2117 VCC.getValue(0));
2118 }
2119
2120 void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
2121 MVT VT = N->getSimpleValueType(0);
2122 bool IsFMA = N->getOpcode() == ISD::FMA;
2123 if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
2124 !Subtarget->hasFmaMixInsts()) ||
2125 ((IsFMA && Subtarget->hasMadMixInsts()) ||
2126 (!IsFMA && Subtarget->hasFmaMixInsts()))) {
2127 SelectCode(N);
2128 return;
2129 }
2130
2131 SDValue Src0 = N->getOperand(0);
2132 SDValue Src1 = N->getOperand(1);
2133 SDValue Src2 = N->getOperand(2);
2134 unsigned Src0Mods, Src1Mods, Src2Mods;
2135
2136 // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
2137 // using the conversion from f16.
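// For instance, fma(fpext(a:f16), b:f32, c:f32) folds the fp_extend on the
// first operand into the mix instruction (Sel0 becomes true below), while an
// FMA whose operands are all plain f32 values gains nothing from the mix form
// and falls through to normal selection.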
2138 bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods); 2139 bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods); 2140 bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods); 2141 2142 assert((IsFMA || !Mode.allFP32Denormals()) && 2143 "fmad selected with denormals enabled"); 2144 // TODO: We can select this with f32 denormals enabled if all the sources are 2145 // converted from f16 (in which case fmad isn't legal). 2146 2147 if (Sel0 || Sel1 || Sel2) { 2148 // For dummy operands. 2149 SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); 2150 SDValue Ops[] = { 2151 CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0, 2152 CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1, 2153 CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2, 2154 CurDAG->getTargetConstant(0, SDLoc(), MVT::i1), 2155 Zero, Zero 2156 }; 2157 2158 CurDAG->SelectNodeTo(N, 2159 IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32, 2160 MVT::f32, Ops); 2161 } else { 2162 SelectCode(N); 2163 } 2164 } 2165 2166 // This is here because there isn't a way to use the generated sub0_sub1 as the 2167 // subreg index to EXTRACT_SUBREG in tablegen. 2168 void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { 2169 MemSDNode *Mem = cast<MemSDNode>(N); 2170 unsigned AS = Mem->getAddressSpace(); 2171 if (AS == AMDGPUAS::FLAT_ADDRESS) { 2172 SelectCode(N); 2173 return; 2174 } 2175 2176 MVT VT = N->getSimpleValueType(0); 2177 bool Is32 = (VT == MVT::i32); 2178 SDLoc SL(N); 2179 2180 MachineSDNode *CmpSwap = nullptr; 2181 if (Subtarget->hasAddr64()) { 2182 SDValue SRsrc, VAddr, SOffset, Offset, SLC; 2183 2184 if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) { 2185 unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN : 2186 AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN; 2187 SDValue CmpVal = Mem->getOperand(2); 2188 2189 // XXX - Do we care about glue operands? 2190 2191 SDValue Ops[] = { 2192 CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain() 2193 }; 2194 2195 CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); 2196 } 2197 } 2198 2199 if (!CmpSwap) { 2200 SDValue SRsrc, SOffset, Offset, SLC; 2201 if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) { 2202 unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN : 2203 AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN; 2204 2205 SDValue CmpVal = Mem->getOperand(2); 2206 SDValue Ops[] = { 2207 CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain() 2208 }; 2209 2210 CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); 2211 } 2212 } 2213 2214 if (!CmpSwap) { 2215 SelectCode(N); 2216 return; 2217 } 2218 2219 MachineMemOperand *MMO = Mem->getMemOperand(); 2220 CurDAG->setNodeMemRefs(CmpSwap, {MMO}); 2221 2222 unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 2223 SDValue Extract 2224 = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0)); 2225 2226 ReplaceUses(SDValue(N, 0), Extract); 2227 ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1)); 2228 CurDAG->RemoveDeadNode(N); 2229 } 2230 2231 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { 2232 // The address is assumed to be uniform, so if it ends up in a VGPR, it will 2233 // be copied to an SGPR with readfirstlane. 2234 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ? 
2235 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 2236 2237 SDValue Chain = N->getOperand(0); 2238 SDValue Ptr = N->getOperand(2); 2239 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); 2240 MachineMemOperand *MMO = M->getMemOperand(); 2241 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 2242 2243 SDValue Offset; 2244 if (CurDAG->isBaseWithConstantOffset(Ptr)) { 2245 SDValue PtrBase = Ptr.getOperand(0); 2246 SDValue PtrOffset = Ptr.getOperand(1); 2247 2248 const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue(); 2249 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) { 2250 N = glueCopyToM0(N, PtrBase); 2251 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32); 2252 } 2253 } 2254 2255 if (!Offset) { 2256 N = glueCopyToM0(N, Ptr); 2257 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); 2258 } 2259 2260 SDValue Ops[] = { 2261 Offset, 2262 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32), 2263 Chain, 2264 N->getOperand(N->getNumOperands() - 1) // New glue 2265 }; 2266 2267 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 2268 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); 2269 } 2270 2271 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 2272 switch (IntrID) { 2273 case Intrinsic::amdgcn_ds_gws_init: 2274 return AMDGPU::DS_GWS_INIT; 2275 case Intrinsic::amdgcn_ds_gws_barrier: 2276 return AMDGPU::DS_GWS_BARRIER; 2277 case Intrinsic::amdgcn_ds_gws_sema_v: 2278 return AMDGPU::DS_GWS_SEMA_V; 2279 case Intrinsic::amdgcn_ds_gws_sema_br: 2280 return AMDGPU::DS_GWS_SEMA_BR; 2281 case Intrinsic::amdgcn_ds_gws_sema_p: 2282 return AMDGPU::DS_GWS_SEMA_P; 2283 case Intrinsic::amdgcn_ds_gws_sema_release_all: 2284 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 2285 default: 2286 llvm_unreachable("not a gws intrinsic"); 2287 } 2288 } 2289 2290 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { 2291 if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all && 2292 !Subtarget->hasGWSSemaReleaseAll()) { 2293 // Let this error. 2294 SelectCode(N); 2295 return; 2296 } 2297 2298 // Chain, intrinsic ID, vsrc, offset 2299 const bool HasVSrc = N->getNumOperands() == 4; 2300 assert(HasVSrc || N->getNumOperands() == 3); 2301 2302 SDLoc SL(N); 2303 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2); 2304 int ImmOffset = 0; 2305 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); 2306 MachineMemOperand *MMO = M->getMemOperand(); 2307 2308 // Don't worry if the offset ends up in a VGPR. Only one lane will have 2309 // effect, so SIFixSGPRCopies will validly insert readfirstlane. 2310 2311 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 2312 // offset field) % 64. Some versions of the programming guide omit the m0 2313 // part, or claim it's from offset 0. 2314 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) { 2315 // If we have a constant offset, try to use the 0 in m0 as the base. 2316 // TODO: Look into changing the default m0 initialization value. If the 2317 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 2318 // the immediate offset. 2319 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32)); 2320 ImmOffset = ConstOffset->getZExtValue(); 2321 } else { 2322 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) { 2323 ImmOffset = BaseOffset.getConstantOperandVal(1); 2324 BaseOffset = BaseOffset.getOperand(0); 2325 } 2326 2327 // Prefer to do the shift in an SGPR since it should be possible to use m0 2328 // as the result directly. 
If it's already an SGPR, it will be eliminated 2329 // later. 2330 SDNode *SGPROffset 2331 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32, 2332 BaseOffset); 2333 // Shift to offset in m0 2334 SDNode *M0Base 2335 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32, 2336 SDValue(SGPROffset, 0), 2337 CurDAG->getTargetConstant(16, SL, MVT::i32)); 2338 glueCopyToM0(N, SDValue(M0Base, 0)); 2339 } 2340 2341 SDValue Chain = N->getOperand(0); 2342 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); 2343 2344 // TODO: Can this just be removed from the instruction? 2345 SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1); 2346 2347 const unsigned Opc = gwsIntrinToOpcode(IntrID); 2348 SmallVector<SDValue, 5> Ops; 2349 if (HasVSrc) 2350 Ops.push_back(N->getOperand(2)); 2351 Ops.push_back(OffsetField); 2352 Ops.push_back(GDS); 2353 Ops.push_back(Chain); 2354 2355 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 2356 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); 2357 } 2358 2359 void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) { 2360 if (Subtarget->getLDSBankCount() != 16) { 2361 // This is a single instruction with a pattern. 2362 SelectCode(N); 2363 return; 2364 } 2365 2366 SDLoc DL(N); 2367 2368 // This requires 2 instructions. It is possible to write a pattern to support 2369 // this, but the generated isel emitter doesn't correctly deal with multiple 2370 // output instructions using the same physical register input. The copy to m0 2371 // is incorrectly placed before the second instruction. 2372 // 2373 // TODO: Match source modifiers. 2374 // 2375 // def : Pat < 2376 // (int_amdgcn_interp_p1_f16 2377 // (VOP3Mods f32:$src0, i32:$src0_modifiers), 2378 // (i32 timm:$attrchan), (i32 timm:$attr), 2379 // (i1 timm:$high), M0), 2380 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr, 2381 // timm:$attrchan, 0, 2382 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> { 2383 // let Predicates = [has16BankLDS]; 2384 // } 2385 2386 // 16 bank LDS 2387 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0, 2388 N->getOperand(5), SDValue()); 2389 2390 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other); 2391 2392 SDNode *InterpMov = 2393 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, { 2394 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0 2395 N->getOperand(3), // Attr 2396 N->getOperand(2), // Attrchan 2397 ToM0.getValue(1) // In glue 2398 }); 2399 2400 SDNode *InterpP1LV = 2401 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, { 2402 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers 2403 N->getOperand(1), // Src0 2404 N->getOperand(3), // Attr 2405 N->getOperand(2), // Attrchan 2406 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers 2407 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high 2408 N->getOperand(4), // high 2409 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp 2410 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod 2411 SDValue(InterpMov, 1) 2412 }); 2413 2414 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0)); 2415 } 2416 2417 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { 2418 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 2419 switch (IntrID) { 2420 case Intrinsic::amdgcn_ds_append: 2421 case Intrinsic::amdgcn_ds_consume: { 2422 if (N->getValueType(0) != MVT::i32) 2423 break; 2424 SelectDSAppendConsume(N, 
IntrID); 2425 return; 2426 } 2427 } 2428 2429 SelectCode(N); 2430 } 2431 2432 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { 2433 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 2434 unsigned Opcode; 2435 switch (IntrID) { 2436 case Intrinsic::amdgcn_wqm: 2437 Opcode = AMDGPU::WQM; 2438 break; 2439 case Intrinsic::amdgcn_softwqm: 2440 Opcode = AMDGPU::SOFT_WQM; 2441 break; 2442 case Intrinsic::amdgcn_wwm: 2443 Opcode = AMDGPU::WWM; 2444 break; 2445 case Intrinsic::amdgcn_interp_p1_f16: 2446 SelectInterpP1F16(N); 2447 return; 2448 default: 2449 SelectCode(N); 2450 return; 2451 } 2452 2453 SDValue Src = N->getOperand(1); 2454 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src}); 2455 } 2456 2457 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { 2458 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 2459 switch (IntrID) { 2460 case Intrinsic::amdgcn_ds_gws_init: 2461 case Intrinsic::amdgcn_ds_gws_barrier: 2462 case Intrinsic::amdgcn_ds_gws_sema_v: 2463 case Intrinsic::amdgcn_ds_gws_sema_br: 2464 case Intrinsic::amdgcn_ds_gws_sema_p: 2465 case Intrinsic::amdgcn_ds_gws_sema_release_all: 2466 SelectDS_GWS(N, IntrID); 2467 return; 2468 default: 2469 break; 2470 } 2471 2472 SelectCode(N); 2473 } 2474 2475 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, 2476 unsigned &Mods) const { 2477 Mods = 0; 2478 Src = In; 2479 2480 if (Src.getOpcode() == ISD::FNEG) { 2481 Mods |= SISrcMods::NEG; 2482 Src = Src.getOperand(0); 2483 } 2484 2485 if (Src.getOpcode() == ISD::FABS) { 2486 Mods |= SISrcMods::ABS; 2487 Src = Src.getOperand(0); 2488 } 2489 2490 return true; 2491 } 2492 2493 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, 2494 SDValue &SrcMods) const { 2495 unsigned Mods; 2496 if (SelectVOP3ModsImpl(In, Src, Mods)) { 2497 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 2498 return true; 2499 } 2500 2501 return false; 2502 } 2503 2504 bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, 2505 SDValue &SrcMods) const { 2506 SelectVOP3Mods(In, Src, SrcMods); 2507 return isNoNanSrc(Src); 2508 } 2509 2510 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { 2511 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG) 2512 return false; 2513 2514 Src = In; 2515 return true; 2516 } 2517 2518 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, 2519 SDValue &SrcMods, SDValue &Clamp, 2520 SDValue &Omod) const { 2521 SDLoc DL(In); 2522 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); 2523 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); 2524 2525 return SelectVOP3Mods(In, Src, SrcMods); 2526 } 2527 2528 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, 2529 SDValue &Clamp, SDValue &Omod) const { 2530 Src = In; 2531 2532 SDLoc DL(In); 2533 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); 2534 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); 2535 2536 return true; 2537 } 2538 2539 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, 2540 SDValue &SrcMods) const { 2541 unsigned Mods = 0; 2542 Src = In; 2543 2544 if (Src.getOpcode() == ISD::FNEG) { 2545 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 2546 Src = Src.getOperand(0); 2547 } 2548 2549 if (Src.getOpcode() == ISD::BUILD_VECTOR) { 2550 unsigned VecMods = Mods; 2551 2552 SDValue Lo = stripBitcast(Src.getOperand(0)); 2553 SDValue Hi = stripBitcast(Src.getOperand(1)); 2554 2555 if (Lo.getOpcode() == ISD::FNEG) { 2556 Lo = 
stripBitcast(Lo.getOperand(0));
2557 Mods ^= SISrcMods::NEG;
2558 }
2559
2560 if (Hi.getOpcode() == ISD::FNEG) {
2561 Hi = stripBitcast(Hi.getOperand(0));
2562 Mods ^= SISrcMods::NEG_HI;
2563 }
2564
2565 if (isExtractHiElt(Lo, Lo))
2566 Mods |= SISrcMods::OP_SEL_0;
2567
2568 if (isExtractHiElt(Hi, Hi))
2569 Mods |= SISrcMods::OP_SEL_1;
2570
2571 Lo = stripExtractLoElt(Lo);
2572 Hi = stripExtractLoElt(Hi);
2573
2574 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2575 // Really a scalar input. Just select from the low half of the register to
2576 // avoid packing.
2577
2578 Src = Lo;
2579 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2580 return true;
2581 }
2582
2583 Mods = VecMods;
2584 }
2585
2586 // Packed instructions do not have abs modifiers.
2587 Mods |= SISrcMods::OP_SEL_1;
2588
2589 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2590 return true;
2591 }
2592
2593 bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
2594 SDValue &SrcMods) const {
2595 Src = In;
2596 // FIXME: Handle op_sel
2597 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
2598 return true;
2599 }
2600
2601 bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
2602 SDValue &SrcMods) const {
2603 // FIXME: Handle op_sel
2604 return SelectVOP3Mods(In, Src, SrcMods);
2605 }
2606
2607 // The return value is not whether the match is possible (which it always is),
2608 // but whether or not a conversion is really used.
2609 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
2610 unsigned &Mods) const {
2611 Mods = 0;
2612 SelectVOP3ModsImpl(In, Src, Mods);
2613
2614 if (Src.getOpcode() == ISD::FP_EXTEND) {
2615 Src = Src.getOperand(0);
2616 assert(Src.getValueType() == MVT::f16);
2617 Src = stripBitcast(Src);
2618
2619 // Be careful about folding modifiers if we already have an abs. fneg is
2620 // applied last, so we don't want to apply an earlier fneg.
2621 if ((Mods & SISrcMods::ABS) == 0) {
2622 unsigned ModsTmp;
2623 SelectVOP3ModsImpl(Src, Src, ModsTmp);
2624
2625 if ((ModsTmp & SISrcMods::NEG) != 0)
2626 Mods ^= SISrcMods::NEG;
2627
2628 if ((ModsTmp & SISrcMods::ABS) != 0)
2629 Mods |= SISrcMods::ABS;
2630 }
2631
2632 // op_sel/op_sel_hi decide the source type and source.
2633 // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
2634 // If the source's op_sel is set, it picks the high half of the source
2635 // register.
2636
2637 Mods |= SISrcMods::OP_SEL_1;
2638 if (isExtractHiElt(Src, Src)) {
2639 Mods |= SISrcMods::OP_SEL_0;
2640
2641 // TODO: Should we try to look for neg/abs here?
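// To make the bit usage concrete: a source of the form
// fp_extend (extract_hi v) ends up with OP_SEL_1 | OP_SEL_0 set (convert from
// f16, taking the high half of the 32-bit register), while a plain fp_extend
// of an f16 value only sets OP_SEL_1.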
2642 }
2643
2644 return true;
2645 }
2646
2647 return false;
2648 }
2649
2650 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
2651 SDValue &SrcMods) const {
2652 unsigned Mods = 0;
2653 SelectVOP3PMadMixModsImpl(In, Src, Mods);
2654 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2655 return true;
2656 }
2657
2658 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
2659 if (In.isUndef())
2660 return CurDAG->getUNDEF(MVT::i32);
2661
2662 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
2663 SDLoc SL(In);
2664 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
2665 }
2666
2667 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
2668 SDLoc SL(In);
2669 return CurDAG->getConstant(
2670 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
2671 }
2672
2673 SDValue Src;
2674 if (isExtractHiElt(In, Src))
2675 return Src;
2676
2677 return SDValue();
2678 }
2679
2680 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
2681 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
2682
2683 const SIRegisterInfo *SIRI =
2684 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
2685 const SIInstrInfo * SII =
2686 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
2687
2688 unsigned Limit = 0;
2689 bool AllUsesAcceptSReg = true;
2690 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
2691 Limit < 10 && U != E; ++U, ++Limit) {
2692 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
2693
2694 // If the register class is unknown, it could be an unknown
2695 // register class that needs to be an SGPR, e.g. an inline asm
2696 // constraint.
2697 if (!RC || SIRI->isSGPRClass(RC))
2698 return false;
2699
2700 if (RC != &AMDGPU::VS_32RegClass) {
2701 AllUsesAcceptSReg = false;
2702 SDNode * User = *U;
2703 if (User->isMachineOpcode()) {
2704 unsigned Opc = User->getMachineOpcode();
2705 MCInstrDesc Desc = SII->get(Opc);
2706 if (Desc.isCommutable()) {
2707 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
2708 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
2709 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
2710 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
2711 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
2712 if (CommutedRC == &AMDGPU::VS_32RegClass)
2713 AllUsesAcceptSReg = true;
2714 }
2715 }
2716 }
2717 // If AllUsesAcceptSReg is still false, we have not succeeded in commuting
2718 // the current user, which means there is at least one use that strictly
2719 // requires a VGPR. Thus, we will not attempt to commute any other user
2720 // instructions.
2721 if (!AllUsesAcceptSReg) 2722 break; 2723 } 2724 } 2725 return !AllUsesAcceptSReg && (Limit < 10); 2726 } 2727 2728 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const { 2729 auto Ld = cast<LoadSDNode>(N); 2730 2731 return Ld->getAlignment() >= 4 && 2732 ( 2733 ( 2734 ( 2735 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 2736 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT 2737 ) 2738 && 2739 !N->isDivergent() 2740 ) 2741 || 2742 ( 2743 Subtarget->getScalarizeGlobalBehavior() && 2744 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && 2745 !Ld->isVolatile() && 2746 !N->isDivergent() && 2747 static_cast<const SITargetLowering *>( 2748 getTargetLowering())->isMemOpHasNoClobberedMemOperand(N) 2749 ) 2750 ); 2751 } 2752 2753 void AMDGPUDAGToDAGISel::PostprocessISelDAG() { 2754 const AMDGPUTargetLowering& Lowering = 2755 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); 2756 bool IsModified = false; 2757 do { 2758 IsModified = false; 2759 2760 // Go over all selected nodes and try to fold them a bit more 2761 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin(); 2762 while (Position != CurDAG->allnodes_end()) { 2763 SDNode *Node = &*Position++; 2764 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node); 2765 if (!MachineNode) 2766 continue; 2767 2768 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); 2769 if (ResNode != Node) { 2770 if (ResNode) 2771 ReplaceUses(Node, ResNode); 2772 IsModified = true; 2773 } 2774 } 2775 CurDAG->RemoveDeadNodes(); 2776 } while (IsModified); 2777 } 2778 2779 bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { 2780 Subtarget = &MF.getSubtarget<R600Subtarget>(); 2781 return SelectionDAGISel::runOnMachineFunction(MF); 2782 } 2783 2784 bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { 2785 if (!N->readMem()) 2786 return false; 2787 if (CbId == -1) 2788 return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 2789 N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT; 2790 2791 return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId; 2792 } 2793 2794 bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, 2795 SDValue& IntPtr) { 2796 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { 2797 IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), 2798 true); 2799 return true; 2800 } 2801 return false; 2802 } 2803 2804 bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, 2805 SDValue& BaseReg, SDValue &Offset) { 2806 if (!isa<ConstantSDNode>(Addr)) { 2807 BaseReg = Addr; 2808 Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); 2809 return true; 2810 } 2811 return false; 2812 } 2813 2814 void R600DAGToDAGISel::Select(SDNode *N) { 2815 unsigned int Opc = N->getOpcode(); 2816 if (N->isMachineOpcode()) { 2817 N->setNodeId(-1); 2818 return; // Already selected. 2819 } 2820 2821 switch (Opc) { 2822 default: break; 2823 case AMDGPUISD::BUILD_VERTICAL_VECTOR: 2824 case ISD::SCALAR_TO_VECTOR: 2825 case ISD::BUILD_VECTOR: { 2826 EVT VT = N->getValueType(0); 2827 unsigned NumVectorElts = VT.getVectorNumElements(); 2828 unsigned RegClassID; 2829 // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG 2830 // that adds a 128 bits reg copy when going through TwoAddressInstructions 2831 // pass. We want to avoid 128 bits copies as much as possible because they 2832 // can't be bundled by our scheduler. 
2833 switch(NumVectorElts) { 2834 case 2: RegClassID = R600::R600_Reg64RegClassID; break; 2835 case 4: 2836 if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) 2837 RegClassID = R600::R600_Reg128VerticalRegClassID; 2838 else 2839 RegClassID = R600::R600_Reg128RegClassID; 2840 break; 2841 default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); 2842 } 2843 SelectBuildVector(N, RegClassID); 2844 return; 2845 } 2846 } 2847 2848 SelectCode(N); 2849 } 2850 2851 bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, 2852 SDValue &Offset) { 2853 ConstantSDNode *C; 2854 SDLoc DL(Addr); 2855 2856 if ((C = dyn_cast<ConstantSDNode>(Addr))) { 2857 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); 2858 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 2859 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && 2860 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { 2861 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); 2862 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 2863 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && 2864 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { 2865 Base = Addr.getOperand(0); 2866 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 2867 } else { 2868 Base = Addr; 2869 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); 2870 } 2871 2872 return true; 2873 } 2874 2875 bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, 2876 SDValue &Offset) { 2877 ConstantSDNode *IMMOffset; 2878 2879 if (Addr.getOpcode() == ISD::ADD 2880 && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) 2881 && isInt<16>(IMMOffset->getZExtValue())) { 2882 2883 Base = Addr.getOperand(0); 2884 Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), 2885 MVT::i32); 2886 return true; 2887 // If the pointer address is constant, we can move it to the offset field. 2888 } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr)) 2889 && isInt<16>(IMMOffset->getZExtValue())) { 2890 Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), 2891 SDLoc(CurDAG->getEntryNode()), 2892 R600::ZERO, MVT::i32); 2893 Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), 2894 MVT::i32); 2895 return true; 2896 } 2897 2898 // Default case, no offset 2899 Base = Addr; 2900 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); 2901 return true; 2902 } 2903