1 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Defines an instruction selector for the AMDGPU target. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPU.h" 15 #include "AMDGPUArgumentUsageInfo.h" 16 #include "AMDGPUISelLowering.h" // For AMDGPUISD 17 #include "AMDGPUInstrInfo.h" 18 #include "AMDGPUPerfHintAnalysis.h" 19 #include "AMDGPUSubtarget.h" 20 #include "AMDGPUTargetMachine.h" 21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 22 #include "SIDefines.h" 23 #include "SIISelLowering.h" 24 #include "SIInstrInfo.h" 25 #include "SIMachineFunctionInfo.h" 26 #include "SIRegisterInfo.h" 27 #include "llvm/ADT/APInt.h" 28 #include "llvm/ADT/SmallVector.h" 29 #include "llvm/ADT/StringRef.h" 30 #include "llvm/Analysis/LegacyDivergenceAnalysis.h" 31 #include "llvm/Analysis/ValueTracking.h" 32 #include "llvm/CodeGen/FunctionLoweringInfo.h" 33 #include "llvm/CodeGen/ISDOpcodes.h" 34 #include "llvm/CodeGen/MachineFunction.h" 35 #include "llvm/CodeGen/MachineRegisterInfo.h" 36 #include "llvm/CodeGen/SelectionDAG.h" 37 #include "llvm/CodeGen/SelectionDAGISel.h" 38 #include "llvm/CodeGen/SelectionDAGNodes.h" 39 #include "llvm/CodeGen/ValueTypes.h" 40 #include "llvm/IR/BasicBlock.h" 41 #include "llvm/InitializePasses.h" 42 #ifdef EXPENSIVE_CHECKS 43 #include "llvm/IR/Dominators.h" 44 #endif 45 #include "llvm/IR/Instruction.h" 46 #include "llvm/MC/MCInstrDesc.h" 47 #include "llvm/Support/Casting.h" 48 #include "llvm/Support/CodeGen.h" 49 #include "llvm/Support/ErrorHandling.h" 50 #include "llvm/Support/MachineValueType.h" 51 #include "llvm/Support/MathExtras.h" 52 
#include <cassert>
#include <cstdint>
#include <new>
#include <vector>

#define DEBUG_TYPE "isel"

using namespace llvm;

namespace llvm {

class R600InstrInfo;

} // end namespace llvm

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {

// Returns true if \p V is undef or the integer constant zero.
static bool isNullConstantOrUndef(SDValue V) {
  if (V.isUndef())
    return true;

  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
  return Const != nullptr && Const->isNullValue();
}

// Extract a 32-bit integer constant from \p N (integer or bitcast FP
// constant). Returns false if \p N is not a constant.
static bool getConstantValue(SDValue N, uint32_t &Out) {
  // This is only used for packed vectors, where using 0 for undef should
  // always be good.
  if (N.isUndef()) {
    Out = 0;
    return true;
  }

  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
    Out = C->getAPIntValue().getSExtValue();
    return true;
  }

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
    Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }

  return false;
}

// TODO: Handle undef as zero
// Fold a v2i16/v2f16 build_vector of constants into a single S_MOV_B32 of
// the packed 32-bit immediate. If \p Negate, each 16-bit half is negated
// before packing. Returns null if either element is not constant.
static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
                                 bool Negate = false) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
  uint32_t LHSVal, RHSVal;
  if (getConstantValue(N->getOperand(0), LHSVal) &&
      getConstantValue(N->getOperand(1), RHSVal)) {
    SDLoc SL(N);
    // Element 0 goes in the low half, element 1 in the high half.
    uint32_t K = Negate ?
      (-LHSVal & 0xffff) | (-RHSVal << 16) :
      (LHSVal & 0xffff) | (RHSVal << 16);
    return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
                              DAG.getTargetConstant(K, SL, MVT::i32));
  }

  return nullptr;
}

// Convenience wrapper: pack with both halves negated.
static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
  return packConstantV2I16(N, DAG, true);
}

/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
  // make the right decision when generating code for different targets.
  const GCNSubtarget *Subtarget;

  // Default FP mode for the current function.
  AMDGPU::SIModeRegisterDefaults Mode;

  bool EnableLateStructurizeCFG;

public:
  explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
                              CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
    : SelectionDAGISel(*TM, OptLevel) {
    EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
  }
  ~AMDGPUDAGToDAGISel() override = default;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AMDGPUArgumentUsageInfo>();
    AU.addRequired<LegacyDivergenceAnalysis>();
#ifdef EXPENSIVE_CHECKS
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
#endif
    SelectionDAGISel::getAnalysisUsage(AU);
  }

  // Tries to fold a build_vector of a 16-bit load into a d16 hi/lo load.
  bool matchLoadD16FromBuildVector(SDNode *N) const;

  bool runOnMachineFunction(MachineFunction &MF) override;
  void PreprocessISelDAG() override;
  void Select(SDNode *N) override;
  StringRef getPassName() const override;
  void PostprocessISelDAG() override;

protected:
  void SelectBuildVector(SDNode *N, unsigned RegClassID);

private:
  std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
  bool isNoNanSrc(SDValue N) const;

  // Inline-immediate checks. With \p Negated, tests whether the negated
  // constant would be an inline immediate instead.
  bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
  bool isNegInlineImmediate(const SDNode *N) const {
    return isInlineImmediate(N, true);
  }

  bool isInlineImmediate16(int64_t Imm) const {
    return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm());
  }

  bool isInlineImmediate32(int64_t Imm) const {
    return AMDGPU::isInlinableLiteral32(Imm, Subtarget->hasInv2PiInlineImm());
  }

  bool isInlineImmediate64(int64_t Imm) const {
    return AMDGPU::isInlinableLiteral64(Imm, Subtarget->hasInv2PiInlineImm());
  }

  bool isInlineImmediate(const APFloat &Imm) const {
    return Subtarget->getInstrInfo()->isInlineConstant(Imm);
  }

  bool isVGPRImm(const SDNode *N) const;
  bool isUniformLoad(const SDNode *N) const;
  bool isUniformBr(const SDNode *N) const;

  MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;

  // Helpers for gluing an M0 initialization onto a node's chain.
  SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
  SDNode *glueCopyToM0LDSInit(SDNode *N) const;

  const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
  virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
  virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);

  // DS (LDS/GDS) addressing-mode selection.
  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
                       unsigned OffsetBits) const;
  bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
  bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                 SDValue &Offset1) const;

  // MUBUF buffer addressing-mode selection.
  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                   SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                   SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
                   SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                         SDValue &SOffset, SDValue &Offset, SDValue &GLC,
                         SDValue &SLC, SDValue &TFE, SDValue &DLC,
                         SDValue &SWZ) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                         SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
                         SDValue &SLC) const;
  bool SelectMUBUFScratchOffen(SDNode *Parent,
                               SDValue Addr, SDValue &RSrc, SDValue &VAddr,
                               SDValue &SOffset, SDValue &ImmOffset) const;
  bool SelectMUBUFScratchOffset(SDNode *Parent,
                                SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                                SDValue &Offset) const;

  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
                         SDValue &Offset, SDValue &GLC, SDValue &SLC,
                         SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                         SDValue &Offset, SDValue &SLC) const;
  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                         SDValue &Offset) const;

  // FLAT addressing-mode selection.
  template <bool IsSigned>
  bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
                        SDValue &Offset, SDValue &SLC) const;
  bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr,
                        SDValue &Offset, SDValue &SLC) const;
  bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr,
                              SDValue &Offset, SDValue &SLC) const;

  // SMRD (scalar memory) addressing-mode selection.
  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                        bool &Imm) const;
  SDValue Expand32BitAddress(SDValue Addr) const;
  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
                  bool &Imm) const;
  bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
  bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
  bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;

  // VOP3 source-modifier (neg/abs/clamp/omod/op_sel) selection.
  bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
  bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
  bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                       SDValue &Clamp, SDValue &Omod) const;
  bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                         SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3OMods(SDValue In, SDValue &Src,
                       SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp) const;

  bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp) const;

  bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                            SDValue &Clamp) const;
  bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                 unsigned &Mods) const;
  bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

  SDValue getHi16Elt(SDValue In) const;

  SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;

  // Custom selection routines for specific opcodes (see Select()).
  void SelectADD_SUB_I64(SDNode *N);
  void SelectAddcSubb(SDNode *N);
  void SelectUADDO_USUBO(SDNode *N);
  void SelectDIV_SCALE(SDNode *N);
  void SelectDIV_FMAS(SDNode *N);
  void SelectMAD_64_32(SDNode *N);
  void SelectFMA_W_CHAIN(SDNode *N);
  void SelectFMUL_W_CHAIN(SDNode *N);

  SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
                   uint32_t Offset, uint32_t Width);
  void SelectS_BFEFromShifts(SDNode *N);
  void SelectS_BFE(SDNode *N);
  bool isCBranchSCC(const SDNode *N) const;
  void SelectBRCOND(SDNode *N);
  void SelectFMAD_FMA(SDNode *N);
  void SelectATOMIC_CMP_SWAP(SDNode *N);
  void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
  void SelectDS_GWS(SDNode *N, unsigned IntrID);
  void SelectInterpP1F16(SDNode *N);
  void SelectINTRINSIC_W_CHAIN(SDNode *N);
  void SelectINTRINSIC_WO_CHAIN(SDNode *N);
  void SelectINTRINSIC_VOID(SDNode *N);

protected:
  // Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
};

// R600 variant of the selector; overrides the address selection hooks and
// uses the R600 generated matcher table.
class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
  const R600Subtarget *Subtarget;

  bool isConstantLoad(const MemSDNode *N, int cbID) const;
  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
  bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
                                       SDValue& Offset);
public:
  explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
      AMDGPUDAGToDAGISel(TM, OptLevel) {}

  void Select(SDNode *N) override;

  bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;
  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;

  bool runOnMachineFunction(MachineFunction &MF) override;

  void PreprocessISelDAG() override {}

protected:
  // Include the pieces autogenerated from the target description.
#include "R600GenDAGISel.inc"
};

// Look through a bitcast, if present.
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
344 static bool isExtractHiElt(SDValue In, SDValue &Out) { 345 In = stripBitcast(In); 346 if (In.getOpcode() != ISD::TRUNCATE) 347 return false; 348 349 SDValue Srl = In.getOperand(0); 350 if (Srl.getOpcode() == ISD::SRL) { 351 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { 352 if (ShiftAmt->getZExtValue() == 16) { 353 Out = stripBitcast(Srl.getOperand(0)); 354 return true; 355 } 356 } 357 } 358 359 return false; 360 } 361 362 // Look through operations that obscure just looking at the low 16-bits of the 363 // same register. 364 static SDValue stripExtractLoElt(SDValue In) { 365 if (In.getOpcode() == ISD::TRUNCATE) { 366 SDValue Src = In.getOperand(0); 367 if (Src.getValueType().getSizeInBits() == 32) 368 return stripBitcast(Src); 369 } 370 371 return In; 372 } 373 374 } // end anonymous namespace 375 376 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel", 377 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) 378 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) 379 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis) 380 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) 381 #ifdef EXPENSIVE_CHECKS 382 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 383 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 384 #endif 385 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel", 386 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) 387 388 /// This pass converts a legalized DAG into a AMDGPU-specific 389 // DAG, ready for instruction scheduling. 390 FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM, 391 CodeGenOpt::Level OptLevel) { 392 return new AMDGPUDAGToDAGISel(TM, OptLevel); 393 } 394 395 /// This pass converts a legalized DAG into a R600-specific 396 // DAG, ready for instruction scheduling. 
FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
                                      CodeGenOpt::Level OptLevel) {
  return new R600DAGToDAGISel(TM, OptLevel);
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  // With expensive checks, verify every loop is in LCSSA form before
  // selection.
  DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  // Cache the subtarget and the function's default FP mode for use during
  // selection.
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

// Try to fold a 16-bit load feeding one element of a 2 x 16-bit
// build_vector into a d16 hi/lo load that writes only half of the
// destination register. Returns true if the DAG was changed.
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    // The low element becomes the tied-in value the d16 load preserves.
    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    // Replace the vector result and the original load's chain.
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  // Walk all nodes in reverse topological order, folding build_vectors into
  // d16 loads where possible.
  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

// Returns true when \p N is known not to be a NaN: either globally via
// -enable-no-nans-fp-math, via the node's own fast-math flags, or by DAG
// analysis.
bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
  if (TM.Options.NoNaNsFPMath)
    return true;

  // TODO: Move into isKnownNeverNaN
  if (N->getFlags().isDefined())
    return N->getFlags().hasNoNaNs();

  return CurDAG->isKnownNeverNaN(N);
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
                                           bool Negated) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (Negated) {
    // Check whether the negated constant would be inlinable; integer
    // negation is arithmetic, FP negation is done on the bit pattern.
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(-C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());

  } else {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
  }

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                          unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Register::isVirtualRegister(Reg)) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      // Physical register: look the class up via the target register info.
      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    // Generic machine node: read the class from the instruction description,
    // skipping over the defs to reach use operand \p OpNo.
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.OpInfo[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    // REG_SEQUENCE operand 0 is the super-register class ID; value operands
    // are interleaved with subregister-index immediates.
    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

// Rebuild \p N with its chain replaced by \p NewChain and \p Glue appended,
// so the node is glued to whatever produced NewChain/Glue.
SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector <SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

// Glue a copy of \p Val into M0 ahead of \p N.
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

// For LDS/GDS memory nodes, glue the required M0 initialization: -1 for LDS
// (when the subtarget needs it), the GDS size for region address space.
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

// Materialize a 64-bit immediate as two S_MOV_B32s combined with a
// REG_SEQUENCE into an SReg_64.
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}

// Map a 32-bit-element vector length to the SGPR register class that can
// hold it.
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
  switch (NumVectorElts) {
  case 1:
    return AMDGPU::SReg_32RegClassID;
  case 2:
    return AMDGPU::SReg_64RegClassID;
  case 3:
    return AMDGPU::SGPR_96RegClassID;
  case 4:
    return AMDGPU::SGPR_128RegClassID;
  case 5:
    return AMDGPU::SGPR_160RegClassID;
  case 8:
    return AMDGPU::SReg_256RegClassID;
  case 16:
    return AMDGPU::SReg_512RegClassID;
  case 32:
    return AMDGPU::SReg_1024RegClassID;
  }

  llvm_unreachable("invalid vector size");
}

// Select a build_vector (or scalar_to_vector) into a REG_SEQUENCE of the
// given register class, one (value, subreg-index) pair per element.
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    // Single element: a plain copy into the class suffices.
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

// Main entry point: dispatch \p N to a custom selection routine, or fall
// through to the generated matcher (SelectCode).
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return;   // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
       Opc == ISD::ATOMIC_LOAD_FADD ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lower it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      // v2i16/v2f16 constant vectors can be packed into one S_MOV_B32.
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    // Select a pair into a REG_SEQUENCE over the matching 64/128-bit class.
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    // Wide constants that are not inline immediates are materialized as a
    // 64-bit S_MOV pair.
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
    else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
                            SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::DIV_FMAS: {
    SelectDIV_FMAS(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FMAD:
  case ISD::FMA:
    SelectFMAD_FMA(N);
    return;
  case AMDGPUISD::ATOMIC_CMP_SWAP:
    SelectATOMIC_CMP_SWAP(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  }

  SelectCode(N);
}

// A branch is treated as uniform if the structurizer/annotator marked its
// block terminator with uniform metadata.
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

StringRef AMDGPUDAGToDAGISel::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

// Base implementation; overridden by the R600 selector.
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

// Split an (R600) indirect address into a base register and constant offset.
bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    // No constant part: whole address is the base, offset 0.
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

// Materialize a 32-bit immediate into an SGPR via S_MOV_B32.
SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
    AMDGPU::S_MOV_B32, DL, MVT::i32,
    CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle addcarry/subcarry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  // Split both 64-bit operands into 32-bit halves.
  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ?
AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 1028 1029 SDNode *AddLo; 1030 if (!ConsumeCarry) { 1031 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; 1032 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args); 1033 } else { 1034 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) }; 1035 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args); 1036 } 1037 SDValue AddHiArgs[] = { 1038 SDValue(Hi0, 0), 1039 SDValue(Hi1, 0), 1040 SDValue(AddLo, 1) 1041 }; 1042 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs); 1043 1044 SDValue RegSequenceArgs[] = { 1045 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), 1046 SDValue(AddLo,0), 1047 Sub0, 1048 SDValue(AddHi,0), 1049 Sub1, 1050 }; 1051 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, 1052 MVT::i64, RegSequenceArgs); 1053 1054 if (ProduceCarry) { 1055 // Replace the carry-use 1056 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1)); 1057 } 1058 1059 // Replace the remaining uses. 1060 ReplaceNode(N, RegSequence); 1061 } 1062 1063 void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) { 1064 SDLoc DL(N); 1065 SDValue LHS = N->getOperand(0); 1066 SDValue RHS = N->getOperand(1); 1067 SDValue CI = N->getOperand(2); 1068 1069 unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64 1070 : AMDGPU::V_SUBB_U32_e64; 1071 CurDAG->SelectNodeTo( 1072 N, Opc, N->getVTList(), 1073 {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); 1074 } 1075 1076 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { 1077 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned 1078 // carry out despite the _i32 name. These were renamed in VI to _U32. 1079 // FIXME: We should probably rename the opcodes here. 1080 unsigned Opc = N->getOpcode() == ISD::UADDO ? 
1081 AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; 1082 1083 CurDAG->SelectNodeTo( 1084 N, Opc, N->getVTList(), 1085 {N->getOperand(0), N->getOperand(1), 1086 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); 1087 } 1088 1089 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) { 1090 SDLoc SL(N); 1091 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod 1092 SDValue Ops[10]; 1093 1094 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]); 1095 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]); 1096 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]); 1097 Ops[8] = N->getOperand(0); 1098 Ops[9] = N->getOperand(4); 1099 1100 CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops); 1101 } 1102 1103 void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) { 1104 SDLoc SL(N); 1105 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod 1106 SDValue Ops[8]; 1107 1108 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]); 1109 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]); 1110 Ops[6] = N->getOperand(0); 1111 Ops[7] = N->getOperand(3); 1112 1113 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops); 1114 } 1115 1116 // We need to handle this here because tablegen doesn't support matching 1117 // instructions with multiple outputs. 1118 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { 1119 SDLoc SL(N); 1120 EVT VT = N->getValueType(0); 1121 1122 assert(VT == MVT::f32 || VT == MVT::f64); 1123 1124 unsigned Opc 1125 = (VT == MVT::f64) ? 
AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; 1126 1127 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; 1128 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 1129 } 1130 1131 void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) { 1132 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget); 1133 const SIRegisterInfo *TRI = ST->getRegisterInfo(); 1134 1135 SDLoc SL(N); 1136 EVT VT = N->getValueType(0); 1137 1138 assert(VT == MVT::f32 || VT == MVT::f64); 1139 1140 unsigned Opc 1141 = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32; 1142 1143 SDValue CarryIn = N->getOperand(3); 1144 // V_DIV_FMAS implicitly reads VCC. 1145 SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL, 1146 TRI->getVCC(), CarryIn, SDValue()); 1147 1148 SDValue Ops[10]; 1149 1150 SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); 1151 SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); 1152 SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); 1153 1154 Ops[8] = VCC; 1155 Ops[9] = VCC.getValue(1); 1156 1157 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 1158 } 1159 1160 // We need to handle this here because tablegen doesn't support matching 1161 // instructions with multiple outputs. 1162 void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { 1163 SDLoc SL(N); 1164 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; 1165 unsigned Opc = Signed ? 
AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32; 1166 1167 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); 1168 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), 1169 Clamp }; 1170 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 1171 } 1172 1173 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset, 1174 unsigned OffsetBits) const { 1175 if ((OffsetBits == 16 && !isUInt<16>(Offset)) || 1176 (OffsetBits == 8 && !isUInt<8>(Offset))) 1177 return false; 1178 1179 if (Subtarget->hasUsableDSOffset() || 1180 Subtarget->unsafeDSOffsetFoldingEnabled()) 1181 return true; 1182 1183 // On Southern Islands instruction with a negative base value and an offset 1184 // don't seem to work. 1185 return CurDAG->SignBitIsZero(Base); 1186 } 1187 1188 bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, 1189 SDValue &Offset) const { 1190 SDLoc DL(Addr); 1191 if (CurDAG->isBaseWithConstantOffset(Addr)) { 1192 SDValue N0 = Addr.getOperand(0); 1193 SDValue N1 = Addr.getOperand(1); 1194 ConstantSDNode *C1 = cast<ConstantSDNode>(N1); 1195 if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { 1196 // (add n0, c0) 1197 Base = N0; 1198 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); 1199 return true; 1200 } 1201 } else if (Addr.getOpcode() == ISD::SUB) { 1202 // sub C, x -> add (sub 0, x), C 1203 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { 1204 int64_t ByteOffset = C->getSExtValue(); 1205 if (isUInt<16>(ByteOffset)) { 1206 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); 1207 1208 // XXX - This is kind of hacky. Create a dummy sub node so we can check 1209 // the known bits in isDSOffsetLegal. We need to emit the selected node 1210 // here, so this is thrown away. 
1211 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, 1212 Zero, Addr.getOperand(1)); 1213 1214 if (isDSOffsetLegal(Sub, ByteOffset, 16)) { 1215 SmallVector<SDValue, 3> Opnds; 1216 Opnds.push_back(Zero); 1217 Opnds.push_back(Addr.getOperand(1)); 1218 1219 // FIXME: Select to VOP3 version for with-carry. 1220 unsigned SubOp = AMDGPU::V_SUB_I32_e32; 1221 if (Subtarget->hasAddNoCarry()) { 1222 SubOp = AMDGPU::V_SUB_U32_e64; 1223 Opnds.push_back( 1224 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit 1225 } 1226 1227 MachineSDNode *MachineSub = 1228 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds); 1229 1230 Base = SDValue(MachineSub, 0); 1231 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16); 1232 return true; 1233 } 1234 } 1235 } 1236 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { 1237 // If we have a constant address, prefer to put the constant into the 1238 // offset. This can save moves to load the constant address since multiple 1239 // operations can share the zero base address register, and enables merging 1240 // into read2 / write2 instructions. 1241 1242 SDLoc DL(Addr); 1243 1244 if (isUInt<16>(CAddr->getZExtValue())) { 1245 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); 1246 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, 1247 DL, MVT::i32, Zero); 1248 Base = SDValue(MovZero, 0); 1249 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); 1250 return true; 1251 } 1252 } 1253 1254 // default case 1255 Base = Addr; 1256 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16); 1257 return true; 1258 } 1259 1260 // TODO: If offset is too big, put low 16-bit into offset. 
// Match a DS address for read2/write2: base plus two 8-bit offsets measured
// in dwords (hence the divide by 4), with offset1 = offset0 + 1. Always
// succeeds (falls back to offsets 0 and 1).
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned DWordOffset0 = C1->getZExtValue() / 4;
    unsigned DWordOffset1 = DWordOffset0 + 1;
    // (add n0, c0)
    if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned DWordOffset0 = C->getZExtValue() / 4;
      unsigned DWordOffset1 = DWordOffset0 + 1;

      if (isUInt<8>(DWordOffset0)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_I32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub
            = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // Constant address: use a zero base register so multiple accesses can
    // share it, and put the whole address into the offsets.
    unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
    unsigned DWordOffset1 = DWordOffset0 + 1;
    assert(4 * DWordOffset0 == CAddr->getZExtValue());

    if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero
        = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                 DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}

// Decompose an address into the full set of MUBUF operands (ptr, vaddr,
// soffset, imm offset, and the offen/idxen/addr64 and cache-control bits).
// Divergent components are routed to vaddr; a too-large constant offset is
// materialized into soffset.
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                     SDValue &VAddr, SDValue &SOffset,
                                     SDValue &Offset, SDValue &Offen,
                                     SDValue &Idxen, SDValue &Addr64,
                                     SDValue &GLC, SDValue &SLC,
                                     SDValue &TFE, SDValue &DLC,
                                     SDValue &SWZ) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  if (!GLC.getNode())
    GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  if (!SLC.getNode())
    SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
  DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);

  // Peel off a constant offset if it fits in 32 bits.
  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
    return true;
  }

  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

// Match the MUBUF addr64 addressing mode: only succeeds when SelectMUBUF
// chose the addr64 form; the pointer is then wrapped into a full resource
// descriptor.
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset, SDValue &GLC,
                                           SDValue &SLC, SDValue &TFE,
                                           SDValue &DLC, SDValue &SWZ) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
                   GLC, SLC, TFE, DLC, SWZ))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

// Convenience overload that forces SLC to 0 and discards the remaining
// cache-control bits.
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset,
                                           SDValue &SLC) const {
  SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
  SDValue GLC, TFE, DLC, SWZ;

  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}

// Returns true if the access is backed by a stack pseudo-source value, i.e.
// it is relative to the stack pointer rather than the scratch wave offset.
static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
  return PSV && PSV->isStack();
}

// For a private (scratch) access, pick the (vaddr, soffset) pair: a frame
// index resolves against the stack pointer SGPR, anything else against the
// scratch wave offset SGPR.
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (auto FI = dyn_cast<FrameIndexSDNode>(N)) {
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));

    // If we can resolve this to a frame index access, this will be relative to
    // either the stack or frame pointer SGPR.
    return std::make_pair(
        TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32));
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
                                               MVT::i32));
}

// Match a scratch access in the offen (per-lane vaddr) form: rsrc is the
// scratch resource, the immediate offset holds the low 12 bits, and any
// high bits of a constant address are materialized into vaddr.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned Imm = CAddr->getZExtValue();

    // Split the constant: bits above the 12-bit offset field go into vaddr.
    SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
    MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                        DL, MVT::i32, HighBits);
    VAddr = SDValue(MovHighBits, 0);

    // In a call sequence, stores to the argument stack area are relative to the
    // stack pointer.
    const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
    unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
      Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();

    SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
    ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
    return true;
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  return true;
}

// Match a scratch access whose whole address is a legal constant immediate
// offset (no vaddr needed).
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
  if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
    return false;

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
  unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
    Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();

  // FIXME: Get from MachinePointerInfo? We should only be using the frame
  // offset if we know this is in a call sequence.
  SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
  return true;
}

// Match the MUBUF offset-only form (no offen/idxen/addr64): the pointer is
// folded into a default resource descriptor.
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset,
                                           SDValue &GLC, SDValue &SLC,
                                           SDValue &TFE, SDValue &DLC,
                                           SDValue &SWZ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
                   GLC, SLC, TFE, DLC, SWZ))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnesValue(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

// Convenience overload discarding all cache-control bits.
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &Soffset, SDValue &Offset
                                           ) const {
  SDValue GLC, SLC, TFE, DLC, SWZ;

  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}

// Convenience overload exposing only SLC.
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &Soffset, SDValue &Offset,
                                           SDValue &SLC) const {
  SDValue GLC, TFE, DLC, SWZ;

  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}

// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
// Walk through bitcasts (and, if needed, the operands of a build_vector) to
// locate the memory node driving this pattern root.
static MemSDNode* findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
          dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

// Match a FLAT address as vaddr + immediate offset. A constant offset that
// fits in the instruction's offset field is folded directly; otherwise the
// low bits go into the offset field and the remainder is added back to the
// pointer with a 64-bit VALU add. Always succeeds.
template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
                                          SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset,
                                          SDValue &SLC) const {
  int64_t OffsetVal = 0;

  if (Subtarget->hasFlatInstOffsets() &&
      (!Subtarget->hasFlatSegmentOffsetBug() ||
       findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
      CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    unsigned AS = findMemSDNode(N)->getAddressSpace();
    if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
      Addr = N0;
      OffsetVal = COffsetVal;
    } else {
      // If the offset doesn't fit, put the low bits into the offset field and
      // add the rest.

      SDLoc DL(N);
      uint64_t ImmField;
      const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
      if (IsSigned) {
        ImmField = SignExtend64(COffsetVal, NumBits);

        // Don't use a negative offset field if the base offset is positive.
        // Since the scheduler currently relies on the offset field, doing so
        // could result in strange scheduling decisions.

        // TODO: Should we not do this in the opposite direction as well?
        if (static_cast<int64_t>(COffsetVal) > 0) {
          if (static_cast<int64_t>(ImmField) < 0) {
            const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1);
            ImmField = COffsetVal & OffsetMask;
          }
        }
      } else {
        // TODO: Should we do this for a negative offset?
        const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
        ImmField = COffsetVal & OffsetMask;
      }

      uint64_t RemainderOffset = COffsetVal - ImmField;

      assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
      assert(RemainderOffset + ImmField == COffsetVal);

      OffsetVal = ImmField;

      // Add RemainderOffset to the base pointer as a 64-bit VALU
      // add-with-carry, then reassemble the pointer with REG_SEQUENCE.
      // TODO: Should this try to use a scalar add pseudo if the base address is
      // uniform and saddr is usable?
      SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

      SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                            DL, MVT::i32, N0, Sub0);
      SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                            DL, MVT::i32, N0, Sub1);

      SDValue AddOffsetLo
        = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
      SDValue AddOffsetHi
        = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
      SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

      SDNode *Add = CurDAG->getMachineNode(
        AMDGPU::V_ADD_I32_e64, DL, VTs,
        {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

      SDNode *Addc = CurDAG->getMachineNode(
        AMDGPU::V_ADDC_U32_e64, DL, VTs,
        {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

      SDValue RegSequenceArgs[] = {
        CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
        SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1
      };

      Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                            MVT::i64, RegSequenceArgs), 0);
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
  SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
  return true;
}

// Unsigned-offset form of the FLAT complex pattern.
bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N,
                                          SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset,
                                          SDValue &SLC) const {
  return SelectFlatOffset<false>(N, Addr, VAddr, Offset, SLC);
}

// Signed-offset form of the FLAT complex pattern.
bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
                                                SDValue Addr,
                                                SDValue &VAddr,
                                                SDValue &Offset,
                                                SDValue &SLC) const {
  return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC);
}

// Encode a constant SMRD byte offset. Imm is set to true when the offset is
// an immediate; otherwise the offset is materialized into an SGPR via
// S_MOV_B32 (on CI a 32-bit literal offset may also be used).
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                          SDValue &Offset, bool &Imm) const {

  // FIXME: Handle non-constant offsets.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C)
    return false;

  SDLoc SL(ByteOffsetNode);
  GCNSubtarget::Generation Gen = Subtarget->getGeneration();
  uint64_t ByteOffset = C->getZExtValue();
  Optional<int64_t> EncodedOffset =
      AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);
  if (EncodedOffset) {
    Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    Imm = true;
    return true;
  }

  if (Gen == AMDGPUSubtarget::SEA_ISLANDS) {
    EncodedOffset =
        AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
    if (EncodedOffset) {
      // Note: Imm is left false — this is the 32-bit literal form.
      Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
      return true;
    }
  }

  if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
    return false;

  SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
  Offset = SDValue(
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);

  return true;
}

// Widen a 32-bit address to 64 bits by pairing it with the function's
// high-address-bits constant in a REG_SEQUENCE. 64-bit addresses pass
// through unchanged.
SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
  if (Addr.getValueType() != MVT::i32)
    return Addr;

  // Zero-extend a 32-bit address.
  SDLoc SL(Addr);

  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned AddrHiVal = Info->get32BitAddressHighBits();
  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);

  const SDValue Ops[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
    Addr,
    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
            0),
    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
  };

  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
                                        Ops), 0);
}

// Match an SMRD address as sbase + offset; Imm reports whether the offset is
// an immediate. Always succeeds (falls back to a zero offset).
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                    SDValue &Offset, bool &Imm) const {
  SDLoc SL(Addr);

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if ((Addr.getValueType() != MVT::i32 ||
       Addr->getFlags().hasNoUnsignedWrap()) &&
      CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    if (SelectSMRDOffset(N1, Offset, Imm)) {
      SBase = Expand32BitAddress(N0);
      return true;
    }
  }
  SBase = Expand32BitAddress(Addr);
  Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
  Imm = true;
  return true;
}

// SMRD with an immediate offset.
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  bool Imm = false;
  return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
}

// SMRD with a 32-bit literal offset (CI only).
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {

  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  bool Imm = false;
  if (!SelectSMRD(Addr, SBase, Offset, Imm))
    return false;

  return !Imm && isa<ConstantSDNode>(Offset);
}

// SMRD with the offset in an SGPR (neither immediate nor literal).
bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
                                        SDValue &Offset) const {
  bool Imm = false;
  return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
         !isa<ConstantSDNode>(Offset);
}

// S_BUFFER_LOAD immediate offset form.
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
                                             SDValue &Offset) const {
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
    if (auto Imm = AMDGPU::getSMRDEncodedOffset(*Subtarget,
                                                C->getZExtValue())) {
      Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
      return true;
    }
  }

  return false;
}

// S_BUFFER_LOAD 32-bit literal offset form (CI only).
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
                                               SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
    if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
                                                         C->getZExtValue())) {
      Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
      return true;
    }
  }

  return false;
}

// Match an index for indirect register addressing (movrel) as base +
// constant offset. Fails only for a bare constant index.
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Index)) {
    SDValue N0 = Index.getOperand(0);
    SDValue N1 = Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  if (isa<ConstantSDNode>(Index))
    return false;

  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
1940 uint32_t PackedVal = Offset | (Width << 16); 1941 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32); 1942 1943 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); 1944 } 1945 1946 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { 1947 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) 1948 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) 1949 // Predicate: 0 < b <= c < 32 1950 1951 const SDValue &Shl = N->getOperand(0); 1952 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1)); 1953 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 1954 1955 if (B && C) { 1956 uint32_t BVal = B->getZExtValue(); 1957 uint32_t CVal = C->getZExtValue(); 1958 1959 if (0 < BVal && BVal <= CVal && CVal < 32) { 1960 bool Signed = N->getOpcode() == ISD::SRA; 1961 unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1962 1963 ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal, 1964 32 - CVal)); 1965 return; 1966 } 1967 } 1968 SelectCode(N); 1969 } 1970 1971 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { 1972 switch (N->getOpcode()) { 1973 case ISD::AND: 1974 if (N->getOperand(0).getOpcode() == ISD::SRL) { 1975 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" 1976 // Predicate: isMask(mask) 1977 const SDValue &Srl = N->getOperand(0); 1978 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1)); 1979 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1)); 1980 1981 if (Shift && Mask) { 1982 uint32_t ShiftVal = Shift->getZExtValue(); 1983 uint32_t MaskVal = Mask->getZExtValue(); 1984 1985 if (isMask_32(MaskVal)) { 1986 uint32_t WidthVal = countPopulation(MaskVal); 1987 1988 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), 1989 Srl.getOperand(0), ShiftVal, WidthVal)); 1990 return; 1991 } 1992 } 1993 } 1994 break; 1995 case ISD::SRL: 1996 if (N->getOperand(0).getOpcode() == ISD::AND) { 1997 // "(a & mask) srl b)" ---> "BFE_U32 a, b, 
popcount(mask >> b)" 1998 // Predicate: isMask(mask >> b) 1999 const SDValue &And = N->getOperand(0); 2000 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1)); 2001 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1)); 2002 2003 if (Shift && Mask) { 2004 uint32_t ShiftVal = Shift->getZExtValue(); 2005 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; 2006 2007 if (isMask_32(MaskVal)) { 2008 uint32_t WidthVal = countPopulation(MaskVal); 2009 2010 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), 2011 And.getOperand(0), ShiftVal, WidthVal)); 2012 return; 2013 } 2014 } 2015 } else if (N->getOperand(0).getOpcode() == ISD::SHL) { 2016 SelectS_BFEFromShifts(N); 2017 return; 2018 } 2019 break; 2020 case ISD::SRA: 2021 if (N->getOperand(0).getOpcode() == ISD::SHL) { 2022 SelectS_BFEFromShifts(N); 2023 return; 2024 } 2025 break; 2026 2027 case ISD::SIGN_EXTEND_INREG: { 2028 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8 2029 SDValue Src = N->getOperand(0); 2030 if (Src.getOpcode() != ISD::SRL) 2031 break; 2032 2033 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1)); 2034 if (!Amt) 2035 break; 2036 2037 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits(); 2038 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0), 2039 Amt->getZExtValue(), Width)); 2040 return; 2041 } 2042 } 2043 2044 SelectCode(N); 2045 } 2046 2047 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const { 2048 assert(N->getOpcode() == ISD::BRCOND); 2049 if (!N->hasOneUse()) 2050 return false; 2051 2052 SDValue Cond = N->getOperand(1); 2053 if (Cond.getOpcode() == ISD::CopyToReg) 2054 Cond = Cond.getOperand(2); 2055 2056 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse()) 2057 return false; 2058 2059 MVT VT = Cond.getOperand(0).getSimpleValueType(); 2060 if (VT == MVT::i32) 2061 return true; 2062 2063 if (VT == MVT::i64) { 2064 auto ST = static_cast<const GCNSubtarget *>(Subtarget); 2065 2066 
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 2067 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64(); 2068 } 2069 2070 return false; 2071 } 2072 2073 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { 2074 SDValue Cond = N->getOperand(1); 2075 2076 if (Cond.isUndef()) { 2077 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other, 2078 N->getOperand(2), N->getOperand(0)); 2079 return; 2080 } 2081 2082 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget); 2083 const SIRegisterInfo *TRI = ST->getRegisterInfo(); 2084 2085 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N); 2086 unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ; 2087 unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC(); 2088 SDLoc SL(N); 2089 2090 if (!UseSCCBr) { 2091 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not 2092 // analyzed what generates the vcc value, so we do not know whether vcc 2093 // bits for disabled lanes are 0. Thus we need to mask out bits for 2094 // disabled lanes. 2095 // 2096 // For the case that we select S_CBRANCH_SCC1 and it gets 2097 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls 2098 // SIInstrInfo::moveToVALU which inserts the S_AND). 2099 // 2100 // We could add an analysis of what generates the vcc value here and omit 2101 // the S_AND when is unnecessary. But it would be better to add a separate 2102 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it 2103 // catches both cases. 2104 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32 2105 : AMDGPU::S_AND_B64, 2106 SL, MVT::i1, 2107 CurDAG->getRegister(ST->isWave32() ? 
AMDGPU::EXEC_LO 2108 : AMDGPU::EXEC, 2109 MVT::i1), 2110 Cond), 2111 0); 2112 } 2113 2114 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond); 2115 CurDAG->SelectNodeTo(N, BrOp, MVT::Other, 2116 N->getOperand(2), // Basic Block 2117 VCC.getValue(0)); 2118 } 2119 2120 void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) { 2121 MVT VT = N->getSimpleValueType(0); 2122 bool IsFMA = N->getOpcode() == ISD::FMA; 2123 if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() && 2124 !Subtarget->hasFmaMixInsts()) || 2125 ((IsFMA && Subtarget->hasMadMixInsts()) || 2126 (!IsFMA && Subtarget->hasFmaMixInsts()))) { 2127 SelectCode(N); 2128 return; 2129 } 2130 2131 SDValue Src0 = N->getOperand(0); 2132 SDValue Src1 = N->getOperand(1); 2133 SDValue Src2 = N->getOperand(2); 2134 unsigned Src0Mods, Src1Mods, Src2Mods; 2135 2136 // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand 2137 // using the conversion from f16. 2138 bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods); 2139 bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods); 2140 bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods); 2141 2142 assert((IsFMA || !Mode.allFP32Denormals()) && 2143 "fmad selected with denormals enabled"); 2144 // TODO: We can select this with f32 denormals enabled if all the sources are 2145 // converted from f16 (in which case fmad isn't legal). 2146 2147 if (Sel0 || Sel1 || Sel2) { 2148 // For dummy operands. 2149 SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); 2150 SDValue Ops[] = { 2151 CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0, 2152 CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1, 2153 CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2, 2154 CurDAG->getTargetConstant(0, SDLoc(), MVT::i1), 2155 Zero, Zero 2156 }; 2157 2158 CurDAG->SelectNodeTo(N, 2159 IsFMA ? 
AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32, 2160 MVT::f32, Ops); 2161 } else { 2162 SelectCode(N); 2163 } 2164 } 2165 2166 // This is here because there isn't a way to use the generated sub0_sub1 as the 2167 // subreg index to EXTRACT_SUBREG in tablegen. 2168 void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { 2169 MemSDNode *Mem = cast<MemSDNode>(N); 2170 unsigned AS = Mem->getAddressSpace(); 2171 if (AS == AMDGPUAS::FLAT_ADDRESS) { 2172 SelectCode(N); 2173 return; 2174 } 2175 2176 MVT VT = N->getSimpleValueType(0); 2177 bool Is32 = (VT == MVT::i32); 2178 SDLoc SL(N); 2179 2180 MachineSDNode *CmpSwap = nullptr; 2181 if (Subtarget->hasAddr64()) { 2182 SDValue SRsrc, VAddr, SOffset, Offset, SLC; 2183 2184 if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) { 2185 unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN : 2186 AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN; 2187 SDValue CmpVal = Mem->getOperand(2); 2188 2189 // XXX - Do we care about glue operands? 2190 2191 SDValue Ops[] = { 2192 CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain() 2193 }; 2194 2195 CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); 2196 } 2197 } 2198 2199 if (!CmpSwap) { 2200 SDValue SRsrc, SOffset, Offset, SLC; 2201 if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) { 2202 unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN : 2203 AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN; 2204 2205 SDValue CmpVal = Mem->getOperand(2); 2206 SDValue Ops[] = { 2207 CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain() 2208 }; 2209 2210 CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); 2211 } 2212 } 2213 2214 if (!CmpSwap) { 2215 SelectCode(N); 2216 return; 2217 } 2218 2219 MachineMemOperand *MMO = Mem->getMemOperand(); 2220 CurDAG->setNodeMemRefs(CmpSwap, {MMO}); 2221 2222 unsigned SubReg = Is32 ? 
AMDGPU::sub0 : AMDGPU::sub0_sub1; 2223 SDValue Extract 2224 = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0)); 2225 2226 ReplaceUses(SDValue(N, 0), Extract); 2227 ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1)); 2228 CurDAG->RemoveDeadNode(N); 2229 } 2230 2231 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { 2232 // The address is assumed to be uniform, so if it ends up in a VGPR, it will 2233 // be copied to an SGPR with readfirstlane. 2234 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ? 2235 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 2236 2237 SDValue Chain = N->getOperand(0); 2238 SDValue Ptr = N->getOperand(2); 2239 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); 2240 MachineMemOperand *MMO = M->getMemOperand(); 2241 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 2242 2243 SDValue Offset; 2244 if (CurDAG->isBaseWithConstantOffset(Ptr)) { 2245 SDValue PtrBase = Ptr.getOperand(0); 2246 SDValue PtrOffset = Ptr.getOperand(1); 2247 2248 const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue(); 2249 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) { 2250 N = glueCopyToM0(N, PtrBase); 2251 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32); 2252 } 2253 } 2254 2255 if (!Offset) { 2256 N = glueCopyToM0(N, Ptr); 2257 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); 2258 } 2259 2260 SDValue Ops[] = { 2261 Offset, 2262 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32), 2263 Chain, 2264 N->getOperand(N->getNumOperands() - 1) // New glue 2265 }; 2266 2267 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 2268 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); 2269 } 2270 2271 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 2272 switch (IntrID) { 2273 case Intrinsic::amdgcn_ds_gws_init: 2274 return AMDGPU::DS_GWS_INIT; 2275 case Intrinsic::amdgcn_ds_gws_barrier: 2276 return AMDGPU::DS_GWS_BARRIER; 2277 case 
Intrinsic::amdgcn_ds_gws_sema_v: 2278 return AMDGPU::DS_GWS_SEMA_V; 2279 case Intrinsic::amdgcn_ds_gws_sema_br: 2280 return AMDGPU::DS_GWS_SEMA_BR; 2281 case Intrinsic::amdgcn_ds_gws_sema_p: 2282 return AMDGPU::DS_GWS_SEMA_P; 2283 case Intrinsic::amdgcn_ds_gws_sema_release_all: 2284 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 2285 default: 2286 llvm_unreachable("not a gws intrinsic"); 2287 } 2288 } 2289 2290 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { 2291 if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all && 2292 !Subtarget->hasGWSSemaReleaseAll()) { 2293 // Let this error. 2294 SelectCode(N); 2295 return; 2296 } 2297 2298 // Chain, intrinsic ID, vsrc, offset 2299 const bool HasVSrc = N->getNumOperands() == 4; 2300 assert(HasVSrc || N->getNumOperands() == 3); 2301 2302 SDLoc SL(N); 2303 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2); 2304 int ImmOffset = 0; 2305 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); 2306 MachineMemOperand *MMO = M->getMemOperand(); 2307 2308 // Don't worry if the offset ends up in a VGPR. Only one lane will have 2309 // effect, so SIFixSGPRCopies will validly insert readfirstlane. 2310 2311 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 2312 // offset field) % 64. Some versions of the programming guide omit the m0 2313 // part, or claim it's from offset 0. 2314 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) { 2315 // If we have a constant offset, try to use the 0 in m0 as the base. 2316 // TODO: Look into changing the default m0 initialization value. If the 2317 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 2318 // the immediate offset. 
2319 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32)); 2320 ImmOffset = ConstOffset->getZExtValue(); 2321 } else { 2322 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) { 2323 ImmOffset = BaseOffset.getConstantOperandVal(1); 2324 BaseOffset = BaseOffset.getOperand(0); 2325 } 2326 2327 // Prefer to do the shift in an SGPR since it should be possible to use m0 2328 // as the result directly. If it's already an SGPR, it will be eliminated 2329 // later. 2330 SDNode *SGPROffset 2331 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32, 2332 BaseOffset); 2333 // Shift to offset in m0 2334 SDNode *M0Base 2335 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32, 2336 SDValue(SGPROffset, 0), 2337 CurDAG->getTargetConstant(16, SL, MVT::i32)); 2338 glueCopyToM0(N, SDValue(M0Base, 0)); 2339 } 2340 2341 SDValue Chain = N->getOperand(0); 2342 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); 2343 2344 // TODO: Can this just be removed from the instruction? 2345 SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1); 2346 2347 const unsigned Opc = gwsIntrinToOpcode(IntrID); 2348 SmallVector<SDValue, 5> Ops; 2349 if (HasVSrc) 2350 Ops.push_back(N->getOperand(2)); 2351 Ops.push_back(OffsetField); 2352 Ops.push_back(GDS); 2353 Ops.push_back(Chain); 2354 2355 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 2356 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); 2357 } 2358 2359 void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) { 2360 if (Subtarget->getLDSBankCount() != 16) { 2361 // This is a single instruction with a pattern. 2362 SelectCode(N); 2363 return; 2364 } 2365 2366 SDLoc DL(N); 2367 2368 // This requires 2 instructions. It is possible to write a pattern to support 2369 // this, but the generated isel emitter doesn't correctly deal with multiple 2370 // output instructions using the same physical register input. 
The copy to m0 2371 // is incorrectly placed before the second instruction. 2372 // 2373 // TODO: Match source modifiers. 2374 // 2375 // def : Pat < 2376 // (int_amdgcn_interp_p1_f16 2377 // (VOP3Mods f32:$src0, i32:$src0_modifiers), 2378 // (i32 timm:$attrchan), (i32 timm:$attr), 2379 // (i1 timm:$high), M0), 2380 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr, 2381 // timm:$attrchan, 0, 2382 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> { 2383 // let Predicates = [has16BankLDS]; 2384 // } 2385 2386 // 16 bank LDS 2387 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0, 2388 N->getOperand(5), SDValue()); 2389 2390 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other); 2391 2392 SDNode *InterpMov = 2393 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, { 2394 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0 2395 N->getOperand(3), // Attr 2396 N->getOperand(2), // Attrchan 2397 ToM0.getValue(1) // In glue 2398 }); 2399 2400 SDNode *InterpP1LV = 2401 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, { 2402 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers 2403 N->getOperand(1), // Src0 2404 N->getOperand(3), // Attr 2405 N->getOperand(2), // Attrchan 2406 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers 2407 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high 2408 N->getOperand(4), // high 2409 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp 2410 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod 2411 SDValue(InterpMov, 1) 2412 }); 2413 2414 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0)); 2415 } 2416 2417 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { 2418 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 2419 switch (IntrID) { 2420 case Intrinsic::amdgcn_ds_append: 2421 case Intrinsic::amdgcn_ds_consume: { 2422 if (N->getValueType(0) != MVT::i32) 2423 break; 2424 
SelectDSAppendConsume(N, IntrID); 2425 return; 2426 } 2427 } 2428 2429 SelectCode(N); 2430 } 2431 2432 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { 2433 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 2434 unsigned Opcode; 2435 switch (IntrID) { 2436 case Intrinsic::amdgcn_wqm: 2437 Opcode = AMDGPU::WQM; 2438 break; 2439 case Intrinsic::amdgcn_softwqm: 2440 Opcode = AMDGPU::SOFT_WQM; 2441 break; 2442 case Intrinsic::amdgcn_wwm: 2443 Opcode = AMDGPU::WWM; 2444 break; 2445 case Intrinsic::amdgcn_interp_p1_f16: 2446 SelectInterpP1F16(N); 2447 return; 2448 default: 2449 SelectCode(N); 2450 return; 2451 } 2452 2453 SDValue Src = N->getOperand(1); 2454 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src}); 2455 } 2456 2457 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { 2458 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 2459 switch (IntrID) { 2460 case Intrinsic::amdgcn_ds_gws_init: 2461 case Intrinsic::amdgcn_ds_gws_barrier: 2462 case Intrinsic::amdgcn_ds_gws_sema_v: 2463 case Intrinsic::amdgcn_ds_gws_sema_br: 2464 case Intrinsic::amdgcn_ds_gws_sema_p: 2465 case Intrinsic::amdgcn_ds_gws_sema_release_all: 2466 SelectDS_GWS(N, IntrID); 2467 return; 2468 default: 2469 break; 2470 } 2471 2472 SelectCode(N); 2473 } 2474 2475 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, 2476 unsigned &Mods) const { 2477 Mods = 0; 2478 Src = In; 2479 2480 if (Src.getOpcode() == ISD::FNEG) { 2481 Mods |= SISrcMods::NEG; 2482 Src = Src.getOperand(0); 2483 } 2484 2485 if (Src.getOpcode() == ISD::FABS) { 2486 Mods |= SISrcMods::ABS; 2487 Src = Src.getOperand(0); 2488 } 2489 2490 return true; 2491 } 2492 2493 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, 2494 SDValue &SrcMods) const { 2495 unsigned Mods; 2496 if (SelectVOP3ModsImpl(In, Src, Mods)) { 2497 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 2498 return true; 2499 } 2500 2501 return false; 2502 } 

/// Match VOP3 source modifiers for \p In, but only succeed when the stripped
/// source is known not to be a NaN (per isNoNanSrc). The modifier match itself
/// always succeeds; the NaN check is the real predicate.
bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  SelectVOP3Mods(In, Src, SrcMods);
  return isNoNanSrc(Src);
}

/// Match \p In only when it carries no source modifiers, i.e. it is not
/// wrapped in an fneg or fabs.
bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
  if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
    return false;

  Src = In;
  return true;
}

/// Like SelectVOP3Mods, but also materializes zero (i.e. disabled) clamp and
/// omod operands for VOP3 instructions that take them.
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, SDValue &Clamp,
                                         SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3Mods(In, Src, SrcMods);
}

/// Pass \p In through unmodified and supply zero (disabled) clamp and omod
/// operands. No source-modifier folding is attempted here.
bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
                                         SDValue &Clamp, SDValue &Omod) const {
  Src = In;

  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return true;
}

/// Match source modifiers for a packed (VOP3P) operand. An fneg of the whole
/// vector toggles both NEG and NEG_HI; per-element fnegs inside a
/// BUILD_VECTOR toggle the corresponding bit individually. Elements taken
/// from a register's high half set the matching OP_SEL bit. Always succeeds.
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  unsigned Mods = 0;
  Src = In;

  // fneg of the packed value negates both halves, so flip both bits. XOR (not
  // OR) so a later per-element fneg can cancel it back out.
  if (Src.getOpcode() == ISD::FNEG) {
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR) {
    // Remember the whole-vector mods in case the per-element match below
    // doesn't pan out and we fall back to selecting the BUILD_VECTOR itself.
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Src.getOperand(0));
    SDValue Hi = stripBitcast(Src.getOperand(1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Lo.getOperand(0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
    }

    // NOTE: isExtractHiElt rewrites Lo/Hi in place to the underlying source
    // when it matches, so the order of these checks matters.
    if (isExtractHiElt(Lo, Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(Hi, Hi))
      Mods |= SISrcMods::OP_SEL_1;

    Lo = stripExtractLoElt(Lo);
    Hi = stripExtractLoElt(Hi);

    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register to
      // avoid packing.

      Src = Lo;
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }

    // Per-element match failed; restore the whole-vector modifiers.
    Mods = VecMods;
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

/// SelectVOP3PMods plus a zero clamp operand.
bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp and op_sel
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3PMods(In, Src, SrcMods);
}

/// Match an operand for an instruction with an op_sel field, currently always
/// producing zero modifiers. Always succeeds.
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

/// SelectVOP3OpSel plus a zero clamp operand.
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSel(In, Src, SrcMods);
}

/// Match neg/abs source modifiers for an op_sel operand; op_sel itself is not
/// yet folded here.
bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}

/// SelectVOP3OpSelMods plus a zero clamp operand.
bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
                                              SDValue &SrcMods,
                                              SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSelMods(In, Src, SrcMods);
}

// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
2642 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, 2643 unsigned &Mods) const { 2644 Mods = 0; 2645 SelectVOP3ModsImpl(In, Src, Mods); 2646 2647 if (Src.getOpcode() == ISD::FP_EXTEND) { 2648 Src = Src.getOperand(0); 2649 assert(Src.getValueType() == MVT::f16); 2650 Src = stripBitcast(Src); 2651 2652 // Be careful about folding modifiers if we already have an abs. fneg is 2653 // applied last, so we don't want to apply an earlier fneg. 2654 if ((Mods & SISrcMods::ABS) == 0) { 2655 unsigned ModsTmp; 2656 SelectVOP3ModsImpl(Src, Src, ModsTmp); 2657 2658 if ((ModsTmp & SISrcMods::NEG) != 0) 2659 Mods ^= SISrcMods::NEG; 2660 2661 if ((ModsTmp & SISrcMods::ABS) != 0) 2662 Mods |= SISrcMods::ABS; 2663 } 2664 2665 // op_sel/op_sel_hi decide the source type and source. 2666 // If the source's op_sel_hi is set, it indicates to do a conversion from fp16. 2667 // If the sources's op_sel is set, it picks the high half of the source 2668 // register. 2669 2670 Mods |= SISrcMods::OP_SEL_1; 2671 if (isExtractHiElt(Src, Src)) { 2672 Mods |= SISrcMods::OP_SEL_0; 2673 2674 // TODO: Should we try to look for neg/abs here? 
2675 } 2676 2677 return true; 2678 } 2679 2680 return false; 2681 } 2682 2683 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, 2684 SDValue &SrcMods) const { 2685 unsigned Mods = 0; 2686 SelectVOP3PMadMixModsImpl(In, Src, Mods); 2687 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 2688 return true; 2689 } 2690 2691 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const { 2692 if (In.isUndef()) 2693 return CurDAG->getUNDEF(MVT::i32); 2694 2695 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) { 2696 SDLoc SL(In); 2697 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32); 2698 } 2699 2700 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) { 2701 SDLoc SL(In); 2702 return CurDAG->getConstant( 2703 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32); 2704 } 2705 2706 SDValue Src; 2707 if (isExtractHiElt(In, Src)) 2708 return Src; 2709 2710 return SDValue(); 2711 } 2712 2713 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { 2714 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn); 2715 2716 const SIRegisterInfo *SIRI = 2717 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); 2718 const SIInstrInfo * SII = 2719 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2720 2721 unsigned Limit = 0; 2722 bool AllUsesAcceptSReg = true; 2723 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); 2724 Limit < 10 && U != E; ++U, ++Limit) { 2725 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); 2726 2727 // If the register class is unknown, it could be an unknown 2728 // register class that needs to be an SGPR, e.g. 
an inline asm 2729 // constraint 2730 if (!RC || SIRI->isSGPRClass(RC)) 2731 return false; 2732 2733 if (RC != &AMDGPU::VS_32RegClass) { 2734 AllUsesAcceptSReg = false; 2735 SDNode * User = *U; 2736 if (User->isMachineOpcode()) { 2737 unsigned Opc = User->getMachineOpcode(); 2738 MCInstrDesc Desc = SII->get(Opc); 2739 if (Desc.isCommutable()) { 2740 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo(); 2741 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex; 2742 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) { 2743 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs(); 2744 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo); 2745 if (CommutedRC == &AMDGPU::VS_32RegClass) 2746 AllUsesAcceptSReg = true; 2747 } 2748 } 2749 } 2750 // If "AllUsesAcceptSReg == false" so far we haven't suceeded 2751 // commuting current user. This means have at least one use 2752 // that strictly require VGPR. Thus, we will not attempt to commute 2753 // other user instructions. 
2754 if (!AllUsesAcceptSReg) 2755 break; 2756 } 2757 } 2758 return !AllUsesAcceptSReg && (Limit < 10); 2759 } 2760 2761 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const { 2762 auto Ld = cast<LoadSDNode>(N); 2763 2764 return Ld->getAlignment() >= 4 && 2765 ( 2766 ( 2767 ( 2768 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 2769 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT 2770 ) 2771 && 2772 !N->isDivergent() 2773 ) 2774 || 2775 ( 2776 Subtarget->getScalarizeGlobalBehavior() && 2777 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && 2778 !Ld->isVolatile() && 2779 !N->isDivergent() && 2780 static_cast<const SITargetLowering *>( 2781 getTargetLowering())->isMemOpHasNoClobberedMemOperand(N) 2782 ) 2783 ); 2784 } 2785 2786 void AMDGPUDAGToDAGISel::PostprocessISelDAG() { 2787 const AMDGPUTargetLowering& Lowering = 2788 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); 2789 bool IsModified = false; 2790 do { 2791 IsModified = false; 2792 2793 // Go over all selected nodes and try to fold them a bit more 2794 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin(); 2795 while (Position != CurDAG->allnodes_end()) { 2796 SDNode *Node = &*Position++; 2797 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node); 2798 if (!MachineNode) 2799 continue; 2800 2801 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); 2802 if (ResNode != Node) { 2803 if (ResNode) 2804 ReplaceUses(Node, ResNode); 2805 IsModified = true; 2806 } 2807 } 2808 CurDAG->RemoveDeadNodes(); 2809 } while (IsModified); 2810 } 2811 2812 bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { 2813 Subtarget = &MF.getSubtarget<R600Subtarget>(); 2814 return SelectionDAGISel::runOnMachineFunction(MF); 2815 } 2816 2817 bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { 2818 if (!N->readMem()) 2819 return false; 2820 if (CbId == -1) 2821 return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 2822 
N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT; 2823 2824 return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId; 2825 } 2826 2827 bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, 2828 SDValue& IntPtr) { 2829 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { 2830 IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), 2831 true); 2832 return true; 2833 } 2834 return false; 2835 } 2836 2837 bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, 2838 SDValue& BaseReg, SDValue &Offset) { 2839 if (!isa<ConstantSDNode>(Addr)) { 2840 BaseReg = Addr; 2841 Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); 2842 return true; 2843 } 2844 return false; 2845 } 2846 2847 void R600DAGToDAGISel::Select(SDNode *N) { 2848 unsigned int Opc = N->getOpcode(); 2849 if (N->isMachineOpcode()) { 2850 N->setNodeId(-1); 2851 return; // Already selected. 2852 } 2853 2854 switch (Opc) { 2855 default: break; 2856 case AMDGPUISD::BUILD_VERTICAL_VECTOR: 2857 case ISD::SCALAR_TO_VECTOR: 2858 case ISD::BUILD_VECTOR: { 2859 EVT VT = N->getValueType(0); 2860 unsigned NumVectorElts = VT.getVectorNumElements(); 2861 unsigned RegClassID; 2862 // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG 2863 // that adds a 128 bits reg copy when going through TwoAddressInstructions 2864 // pass. We want to avoid 128 bits copies as much as possible because they 2865 // can't be bundled by our scheduler. 
2866 switch(NumVectorElts) { 2867 case 2: RegClassID = R600::R600_Reg64RegClassID; break; 2868 case 4: 2869 if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) 2870 RegClassID = R600::R600_Reg128VerticalRegClassID; 2871 else 2872 RegClassID = R600::R600_Reg128RegClassID; 2873 break; 2874 default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); 2875 } 2876 SelectBuildVector(N, RegClassID); 2877 return; 2878 } 2879 } 2880 2881 SelectCode(N); 2882 } 2883 2884 bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, 2885 SDValue &Offset) { 2886 ConstantSDNode *C; 2887 SDLoc DL(Addr); 2888 2889 if ((C = dyn_cast<ConstantSDNode>(Addr))) { 2890 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); 2891 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 2892 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && 2893 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { 2894 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); 2895 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 2896 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && 2897 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { 2898 Base = Addr.getOperand(0); 2899 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 2900 } else { 2901 Base = Addr; 2902 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); 2903 } 2904 2905 return true; 2906 } 2907 2908 bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, 2909 SDValue &Offset) { 2910 ConstantSDNode *IMMOffset; 2911 2912 if (Addr.getOpcode() == ISD::ADD 2913 && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) 2914 && isInt<16>(IMMOffset->getZExtValue())) { 2915 2916 Base = Addr.getOperand(0); 2917 Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), 2918 MVT::i32); 2919 return true; 2920 // If the pointer address is constant, we can move it to the offset field. 
2921 } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr)) 2922 && isInt<16>(IMMOffset->getZExtValue())) { 2923 Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), 2924 SDLoc(CurDAG->getEntryNode()), 2925 R600::ZERO, MVT::i32); 2926 Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), 2927 MVT::i32); 2928 return true; 2929 } 2930 2931 // Default case, no offset 2932 Base = Addr; 2933 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); 2934 return true; 2935 } 2936