1 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Defines an instruction selector for the AMDGPU target. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPU.h" 15 #include "AMDGPUArgumentUsageInfo.h" 16 #include "AMDGPUISelLowering.h" // For AMDGPUISD 17 #include "AMDGPUInstrInfo.h" 18 #include "AMDGPUPerfHintAnalysis.h" 19 #include "AMDGPURegisterInfo.h" 20 #include "AMDGPUSubtarget.h" 21 #include "AMDGPUTargetMachine.h" 22 #include "SIDefines.h" 23 #include "SIISelLowering.h" 24 #include "SIInstrInfo.h" 25 #include "SIMachineFunctionInfo.h" 26 #include "SIRegisterInfo.h" 27 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 28 #include "llvm/ADT/APInt.h" 29 #include "llvm/ADT/SmallVector.h" 30 #include "llvm/ADT/StringRef.h" 31 #include "llvm/Analysis/LegacyDivergenceAnalysis.h" 32 #include "llvm/Analysis/ValueTracking.h" 33 #include "llvm/CodeGen/FunctionLoweringInfo.h" 34 #include "llvm/CodeGen/ISDOpcodes.h" 35 #include "llvm/CodeGen/MachineFunction.h" 36 #include "llvm/CodeGen/MachineRegisterInfo.h" 37 #include "llvm/CodeGen/SelectionDAG.h" 38 #include "llvm/CodeGen/SelectionDAGISel.h" 39 #include "llvm/CodeGen/SelectionDAGNodes.h" 40 #include "llvm/CodeGen/ValueTypes.h" 41 #include "llvm/IR/BasicBlock.h" 42 #include "llvm/IR/Instruction.h" 43 #include "llvm/MC/MCInstrDesc.h" 44 #include "llvm/Support/Casting.h" 45 #include "llvm/Support/CodeGen.h" 46 #include "llvm/Support/ErrorHandling.h" 47 #include "llvm/Support/MachineValueType.h" 48 #include "llvm/Support/MathExtras.h" 49 #include <cassert> 50 #include <cstdint> 51 #include <new> 52 #include <vector> 53 54 #define DEBUG_TYPE "isel" 55 56 using namespace llvm; 57 58 namespace llvm { 59 60 class R600InstrInfo; 61 62 } // end namespace llvm 63 64 //===----------------------------------------------------------------------===// 65 // Instruction Selector Implementation 66 //===----------------------------------------------------------------------===// 67 68 namespace { 69 70 /// AMDGPU specific code to select AMDGPU machine instructions for 71 /// SelectionDAG operations. 72 class AMDGPUDAGToDAGISel : public SelectionDAGISel { 73 // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can 74 // make the right decision when generating code for different targets. 
75 const GCNSubtarget *Subtarget; 76 bool EnableLateStructurizeCFG; 77 78 public: 79 explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr, 80 CodeGenOpt::Level OptLevel = CodeGenOpt::Default) 81 : SelectionDAGISel(*TM, OptLevel) { 82 EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG; 83 } 84 ~AMDGPUDAGToDAGISel() override = default; 85 86 void getAnalysisUsage(AnalysisUsage &AU) const override { 87 AU.addRequired<AMDGPUArgumentUsageInfo>(); 88 AU.addRequired<AMDGPUPerfHintAnalysis>(); 89 AU.addRequired<LegacyDivergenceAnalysis>(); 90 SelectionDAGISel::getAnalysisUsage(AU); 91 } 92 93 bool matchLoadD16FromBuildVector(SDNode *N) const; 94 95 bool runOnMachineFunction(MachineFunction &MF) override; 96 void PreprocessISelDAG() override; 97 void Select(SDNode *N) override; 98 StringRef getPassName() const override; 99 void PostprocessISelDAG() override; 100 101 protected: 102 void SelectBuildVector(SDNode *N, unsigned RegClassID); 103 104 private: 105 std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; 106 bool isNoNanSrc(SDValue N) const; 107 bool isInlineImmediate(const SDNode *N) const; 108 bool isVGPRImm(const SDNode *N) const; 109 bool isUniformLoad(const SDNode *N) const; 110 bool isUniformBr(const SDNode *N) const; 111 112 MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; 113 114 SDNode *glueCopyToM0LDSInit(SDNode *N) const; 115 SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; 116 117 const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; 118 virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); 119 virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); 120 bool isDSOffsetLegal(SDValue Base, unsigned Offset, 121 unsigned OffsetBits) const; 122 bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; 123 bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, 124 SDValue &Offset1) const; 125 bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, 126 SDValue &SOffset, SDValue &Offset, SDValue &Offen, 127 SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, 128 SDValue &TFE, SDValue &DLC) const; 129 bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, 130 SDValue &SOffset, SDValue &Offset, SDValue &GLC, 131 SDValue &SLC, SDValue &TFE, SDValue &DLC) const; 132 bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, 133 SDValue &VAddr, SDValue &SOffset, SDValue &Offset, 134 SDValue &SLC) const; 135 bool SelectMUBUFScratchOffen(SDNode *Parent, 136 SDValue Addr, SDValue &RSrc, SDValue &VAddr, 137 SDValue &SOffset, SDValue &ImmOffset) const; 138 bool SelectMUBUFScratchOffset(SDNode *Parent, 139 SDValue Addr, SDValue &SRsrc, SDValue &Soffset, 140 SDValue &Offset) const; 141 142 bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, 143 SDValue &Offset, SDValue &GLC, SDValue &SLC, 144 SDValue &TFE, SDValue &DLC) const; 145 bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, 146 SDValue &Offset, SDValue &SLC) const; 147 bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, 148 SDValue &Offset) const; 149 150 bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr, 151 SDValue &Offset, SDValue &SLC) const; 152 bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr, 153 SDValue &Offset, SDValue &SLC) const; 154 155 template <bool IsSigned> 156 bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, 157 SDValue &Offset, 
SDValue &SLC) const; 158 159 bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, 160 bool &Imm) const; 161 SDValue Expand32BitAddress(SDValue Addr) const; 162 bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, 163 bool &Imm) const; 164 bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; 165 bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; 166 bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; 167 bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; 168 bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; 169 bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; 170 171 bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; 172 bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const; 173 bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; 174 bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; 175 bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, 176 SDValue &Clamp, SDValue &Omod) const; 177 bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, 178 SDValue &Clamp, SDValue &Omod) const; 179 180 bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, 181 SDValue &Clamp, 182 SDValue &Omod) const; 183 184 bool SelectVOP3OMods(SDValue In, SDValue &Src, 185 SDValue &Clamp, SDValue &Omod) const; 186 187 bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; 188 bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods, 189 SDValue &Clamp) const; 190 191 bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; 192 bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods, 193 SDValue &Clamp) const; 194 195 bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; 196 bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods, 197 SDValue &Clamp) const; 198 bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const; 199 bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; 200 201 SDValue getHi16Elt(SDValue In) const; 202 203 void SelectADD_SUB_I64(SDNode *N); 204 void SelectAddcSubb(SDNode *N); 205 void SelectUADDO_USUBO(SDNode *N); 206 void SelectDIV_SCALE(SDNode *N); 207 void SelectDIV_FMAS(SDNode *N); 208 void SelectMAD_64_32(SDNode *N); 209 void SelectFMA_W_CHAIN(SDNode *N); 210 void SelectFMUL_W_CHAIN(SDNode *N); 211 212 SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val, 213 uint32_t Offset, uint32_t Width); 214 void SelectS_BFEFromShifts(SDNode *N); 215 void SelectS_BFE(SDNode *N); 216 bool isCBranchSCC(const SDNode *N) const; 217 void SelectBRCOND(SDNode *N); 218 void SelectFMAD_FMA(SDNode *N); 219 void SelectATOMIC_CMP_SWAP(SDNode *N); 220 void SelectINTRINSIC_W_CHAIN(SDNode *N); 221 222 protected: 223 // Include the pieces autogenerated from the target description. 
224 #include "AMDGPUGenDAGISel.inc" 225 }; 226 227 class R600DAGToDAGISel : public AMDGPUDAGToDAGISel { 228 const R600Subtarget *Subtarget; 229 230 bool isConstantLoad(const MemSDNode *N, int cbID) const; 231 bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); 232 bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, 233 SDValue& Offset); 234 public: 235 explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) : 236 AMDGPUDAGToDAGISel(TM, OptLevel) {} 237 238 void Select(SDNode *N) override; 239 240 bool SelectADDRIndirect(SDValue Addr, SDValue &Base, 241 SDValue &Offset) override; 242 bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, 243 SDValue &Offset) override; 244 245 bool runOnMachineFunction(MachineFunction &MF) override; 246 247 void PreprocessISelDAG() override {} 248 249 protected: 250 // Include the pieces autogenerated from the target description. 251 #include "R600GenDAGISel.inc" 252 }; 253 254 static SDValue stripBitcast(SDValue Val) { 255 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val; 256 } 257 258 // Figure out if this is really an extract of the high 16-bits of a dword. 259 static bool isExtractHiElt(SDValue In, SDValue &Out) { 260 In = stripBitcast(In); 261 if (In.getOpcode() != ISD::TRUNCATE) 262 return false; 263 264 SDValue Srl = In.getOperand(0); 265 if (Srl.getOpcode() == ISD::SRL) { 266 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { 267 if (ShiftAmt->getZExtValue() == 16) { 268 Out = stripBitcast(Srl.getOperand(0)); 269 return true; 270 } 271 } 272 } 273 274 return false; 275 } 276 277 // Look through operations that obscure just looking at the low 16-bits of the 278 // same register. 279 static SDValue stripExtractLoElt(SDValue In) { 280 if (In.getOpcode() == ISD::TRUNCATE) { 281 SDValue Src = In.getOperand(0); 282 if (Src.getValueType().getSizeInBits() == 32) 283 return stripBitcast(Src); 284 } 285 286 return In; 287 } 288 289 } // end anonymous namespace 290 291 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel", 292 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) 293 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) 294 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis) 295 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) 296 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel", 297 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) 298 299 /// This pass converts a legalized DAG into a AMDGPU-specific 300 // DAG, ready for instruction scheduling. 301 FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM, 302 CodeGenOpt::Level OptLevel) { 303 return new AMDGPUDAGToDAGISel(TM, OptLevel); 304 } 305 306 /// This pass converts a legalized DAG into a R600-specific 307 // DAG, ready for instruction scheduling. 
308 FunctionPass *llvm::createR600ISelDag(TargetMachine *TM, 309 CodeGenOpt::Level OptLevel) { 310 return new R600DAGToDAGISel(TM, OptLevel); 311 } 312 313 bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { 314 Subtarget = &MF.getSubtarget<GCNSubtarget>(); 315 return SelectionDAGISel::runOnMachineFunction(MF); 316 } 317 318 bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const { 319 assert(Subtarget->d16PreservesUnusedBits()); 320 MVT VT = N->getValueType(0).getSimpleVT(); 321 if (VT != MVT::v2i16 && VT != MVT::v2f16) 322 return false; 323 324 SDValue Lo = N->getOperand(0); 325 SDValue Hi = N->getOperand(1); 326 327 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi)); 328 329 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo 330 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo 331 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo 332 333 // Need to check for possible indirect dependencies on the other half of the 334 // vector to avoid introducing a cycle. 335 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) { 336 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other); 337 338 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo); 339 SDValue Ops[] = { 340 LdHi->getChain(), LdHi->getBasePtr(), TiedIn 341 }; 342 343 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI; 344 if (LdHi->getMemoryVT() == MVT::i8) { 345 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ? 346 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8; 347 } else { 348 assert(LdHi->getMemoryVT() == MVT::i16); 349 } 350 351 SDValue NewLoadHi = 352 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList, 353 Ops, LdHi->getMemoryVT(), 354 LdHi->getMemOperand()); 355 356 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi); 357 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1)); 358 return true; 359 } 360 361 // build_vector (load ptr), hi -> load_d16_lo ptr, hi 362 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi 363 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi 364 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo)); 365 if (LdLo && Lo.hasOneUse()) { 366 SDValue TiedIn = getHi16Elt(Hi); 367 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode())) 368 return false; 369 370 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other); 371 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO; 372 if (LdLo->getMemoryVT() == MVT::i8) { 373 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ? 
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
  if (TM.Options.NoNaNsFPMath)
    return true;

  // TODO: Move into isKnownNeverNaN
  if (N->getFlags().isDefined())
    return N->getFlags().hasNoNaNs();

  return CurDAG->isKnownNeverNaN(N);
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or nullptr if the register class cannot be
/// determined.
453 const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, 454 unsigned OpNo) const { 455 if (!N->isMachineOpcode()) { 456 if (N->getOpcode() == ISD::CopyToReg) { 457 unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg(); 458 if (TargetRegisterInfo::isVirtualRegister(Reg)) { 459 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo(); 460 return MRI.getRegClass(Reg); 461 } 462 463 const SIRegisterInfo *TRI 464 = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo(); 465 return TRI->getPhysRegClass(Reg); 466 } 467 468 return nullptr; 469 } 470 471 switch (N->getMachineOpcode()) { 472 default: { 473 const MCInstrDesc &Desc = 474 Subtarget->getInstrInfo()->get(N->getMachineOpcode()); 475 unsigned OpIdx = Desc.getNumDefs() + OpNo; 476 if (OpIdx >= Desc.getNumOperands()) 477 return nullptr; 478 int RegClass = Desc.OpInfo[OpIdx].RegClass; 479 if (RegClass == -1) 480 return nullptr; 481 482 return Subtarget->getRegisterInfo()->getRegClass(RegClass); 483 } 484 case AMDGPU::REG_SEQUENCE: { 485 unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 486 const TargetRegisterClass *SuperRC = 487 Subtarget->getRegisterInfo()->getRegClass(RCID); 488 489 SDValue SubRegOp = N->getOperand(OpNo + 1); 490 unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); 491 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC, 492 SubRegIdx); 493 } 494 } 495 } 496 497 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { 498 const SITargetLowering& Lowering = 499 *static_cast<const SITargetLowering*>(getTargetLowering()); 500 501 // Write max value to m0 before each load operation 502 503 SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N), 504 Val); 505 506 SDValue Glue = M0.getValue(1); 507 508 SmallVector <SDValue, 8> Ops; 509 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) 510 Ops.push_back(N->getOperand(i)); 511 512 Ops.push_back(Glue); 513 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); 514 } 515 516 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const { 517 if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS || 518 !Subtarget->ldsRequiresM0Init()) 519 return N; 520 return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); 521 } 522 523 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, 524 EVT VT) const { 525 SDNode *Lo = CurDAG->getMachineNode( 526 AMDGPU::S_MOV_B32, DL, MVT::i32, 527 CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32)); 528 SDNode *Hi = 529 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, 530 CurDAG->getConstant(Imm >> 32, DL, MVT::i32)); 531 const SDValue Ops[] = { 532 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), 533 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 534 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)}; 535 536 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops); 537 } 538 539 static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { 540 switch (NumVectorElts) { 541 case 1: 542 return AMDGPU::SReg_32_XM0RegClassID; 543 case 2: 544 return AMDGPU::SReg_64RegClassID; 545 case 3: 546 return AMDGPU::SGPR_96RegClassID; 547 case 4: 548 return AMDGPU::SReg_128RegClassID; 549 case 5: 550 return AMDGPU::SGPR_160RegClassID; 551 case 8: 552 return AMDGPU::SReg_256RegClassID; 553 case 16: 554 return AMDGPU::SReg_512RegClassID; 555 } 556 557 
llvm_unreachable("invalid vector size"); 558 } 559 560 static bool getConstantValue(SDValue N, uint32_t &Out) { 561 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { 562 Out = C->getAPIntValue().getZExtValue(); 563 return true; 564 } 565 566 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) { 567 Out = C->getValueAPF().bitcastToAPInt().getZExtValue(); 568 return true; 569 } 570 571 return false; 572 } 573 574 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { 575 EVT VT = N->getValueType(0); 576 unsigned NumVectorElts = VT.getVectorNumElements(); 577 EVT EltVT = VT.getVectorElementType(); 578 SDLoc DL(N); 579 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); 580 581 if (NumVectorElts == 1) { 582 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0), 583 RegClass); 584 return; 585 } 586 587 assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " 588 "supported yet"); 589 // 16 = Max Num Vector Elements 590 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) 591 // 1 = Vector Register Class 592 SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); 593 594 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); 595 bool IsRegSeq = true; 596 unsigned NOps = N->getNumOperands(); 597 for (unsigned i = 0; i < NOps; i++) { 598 // XXX: Why is this here? 599 if (isa<RegisterSDNode>(N->getOperand(i))) { 600 IsRegSeq = false; 601 break; 602 } 603 unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i); 604 RegSeqArgs[1 + (2 * i)] = N->getOperand(i); 605 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32); 606 } 607 if (NOps != NumVectorElts) { 608 // Fill in the missing undef elements if this was a scalar_to_vector. 609 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); 610 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, 611 DL, EltVT); 612 for (unsigned i = NOps; i < NumVectorElts; ++i) { 613 unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i); 614 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); 615 RegSeqArgs[1 + (2 * i) + 1] = 616 CurDAG->getTargetConstant(Sub, DL, MVT::i32); 617 } 618 } 619 620 if (!IsRegSeq) 621 SelectCode(N); 622 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs); 623 } 624 625 void AMDGPUDAGToDAGISel::Select(SDNode *N) { 626 unsigned int Opc = N->getOpcode(); 627 if (N->isMachineOpcode()) { 628 N->setNodeId(-1); 629 return; // Already selected. 630 } 631 632 if (isa<AtomicSDNode>(N) || 633 (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || 634 Opc == ISD::ATOMIC_LOAD_FADD || 635 Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || 636 Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) 637 N = glueCopyToM0LDSInit(N); 638 639 switch (Opc) { 640 default: 641 break; 642 // We are selecting i64 ADD here instead of custom lower it during 643 // DAG legalization, so we can fold some i64 ADDs used for address 644 // calculation into the LOAD and STORE instructions. 
645 case ISD::ADDC: 646 case ISD::ADDE: 647 case ISD::SUBC: 648 case ISD::SUBE: { 649 if (N->getValueType(0) != MVT::i64) 650 break; 651 652 SelectADD_SUB_I64(N); 653 return; 654 } 655 case ISD::ADDCARRY: 656 case ISD::SUBCARRY: 657 if (N->getValueType(0) != MVT::i32) 658 break; 659 660 SelectAddcSubb(N); 661 return; 662 case ISD::UADDO: 663 case ISD::USUBO: { 664 SelectUADDO_USUBO(N); 665 return; 666 } 667 case AMDGPUISD::FMUL_W_CHAIN: { 668 SelectFMUL_W_CHAIN(N); 669 return; 670 } 671 case AMDGPUISD::FMA_W_CHAIN: { 672 SelectFMA_W_CHAIN(N); 673 return; 674 } 675 676 case ISD::SCALAR_TO_VECTOR: 677 case ISD::BUILD_VECTOR: { 678 EVT VT = N->getValueType(0); 679 unsigned NumVectorElts = VT.getVectorNumElements(); 680 if (VT.getScalarSizeInBits() == 16) { 681 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) { 682 uint32_t LHSVal, RHSVal; 683 if (getConstantValue(N->getOperand(0), LHSVal) && 684 getConstantValue(N->getOperand(1), RHSVal)) { 685 uint32_t K = LHSVal | (RHSVal << 16); 686 CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT, 687 CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32)); 688 return; 689 } 690 } 691 692 break; 693 } 694 695 assert(VT.getVectorElementType().bitsEq(MVT::i32)); 696 unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts); 697 SelectBuildVector(N, RegClassID); 698 return; 699 } 700 case ISD::BUILD_PAIR: { 701 SDValue RC, SubReg0, SubReg1; 702 SDLoc DL(N); 703 if (N->getValueType(0) == MVT::i128) { 704 RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); 705 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); 706 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); 707 } else if (N->getValueType(0) == MVT::i64) { 708 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32); 709 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); 710 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); 711 } else { 712 llvm_unreachable("Unhandled value type for BUILD_PAIR"); 713 } 714 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, 715 N->getOperand(1), SubReg1 }; 716 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, 717 N->getValueType(0), Ops)); 718 return; 719 } 720 721 case ISD::Constant: 722 case ISD::ConstantFP: { 723 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) 724 break; 725 726 uint64_t Imm; 727 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) 728 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); 729 else { 730 ConstantSDNode *C = cast<ConstantSDNode>(N); 731 Imm = C->getZExtValue(); 732 } 733 734 SDLoc DL(N); 735 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0))); 736 return; 737 } 738 case ISD::LOAD: 739 case ISD::STORE: 740 case ISD::ATOMIC_LOAD: 741 case ISD::ATOMIC_STORE: { 742 N = glueCopyToM0LDSInit(N); 743 break; 744 } 745 746 case AMDGPUISD::BFE_I32: 747 case AMDGPUISD::BFE_U32: { 748 // There is a scalar version available, but unlike the vector version which 749 // has a separate operand for the offset and width, the scalar version packs 750 // the width and offset into a single operand. Try to move to the scalar 751 // version if the offsets are constant, so that we can try to keep extended 752 // loads of kernel arguments in SGPRs. 753 754 // TODO: Technically we could try to pattern match scalar bitshifts of 755 // dynamic values, but it's probably not useful. 
756 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); 757 if (!Offset) 758 break; 759 760 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); 761 if (!Width) 762 break; 763 764 bool Signed = Opc == AMDGPUISD::BFE_I32; 765 766 uint32_t OffsetVal = Offset->getZExtValue(); 767 uint32_t WidthVal = Width->getZExtValue(); 768 769 ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, 770 SDLoc(N), N->getOperand(0), OffsetVal, WidthVal)); 771 return; 772 } 773 case AMDGPUISD::DIV_SCALE: { 774 SelectDIV_SCALE(N); 775 return; 776 } 777 case AMDGPUISD::DIV_FMAS: { 778 SelectDIV_FMAS(N); 779 return; 780 } 781 case AMDGPUISD::MAD_I64_I32: 782 case AMDGPUISD::MAD_U64_U32: { 783 SelectMAD_64_32(N); 784 return; 785 } 786 case ISD::CopyToReg: { 787 const SITargetLowering& Lowering = 788 *static_cast<const SITargetLowering*>(getTargetLowering()); 789 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG); 790 break; 791 } 792 case ISD::AND: 793 case ISD::SRL: 794 case ISD::SRA: 795 case ISD::SIGN_EXTEND_INREG: 796 if (N->getValueType(0) != MVT::i32) 797 break; 798 799 SelectS_BFE(N); 800 return; 801 case ISD::BRCOND: 802 SelectBRCOND(N); 803 return; 804 case ISD::FMAD: 805 case ISD::FMA: 806 SelectFMAD_FMA(N); 807 return; 808 case AMDGPUISD::ATOMIC_CMP_SWAP: 809 SelectATOMIC_CMP_SWAP(N); 810 return; 811 case AMDGPUISD::CVT_PKRTZ_F16_F32: 812 case AMDGPUISD::CVT_PKNORM_I16_F32: 813 case AMDGPUISD::CVT_PKNORM_U16_F32: 814 case AMDGPUISD::CVT_PK_U16_U32: 815 case AMDGPUISD::CVT_PK_I16_I32: { 816 // Hack around using a legal type if f16 is illegal. 817 if (N->getValueType(0) == MVT::i32) { 818 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16; 819 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT), 820 { N->getOperand(0), N->getOperand(1) }); 821 SelectCode(N); 822 return; 823 } 824 825 break; 826 } 827 case ISD::INTRINSIC_W_CHAIN: { 828 SelectINTRINSIC_W_CHAIN(N); 829 return; 830 } 831 } 832 833 SelectCode(N); 834 } 835 836 bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { 837 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); 838 const Instruction *Term = BB->getTerminator(); 839 return Term->getMetadata("amdgpu.uniform") || 840 Term->getMetadata("structurizecfg.uniform"); 841 } 842 843 StringRef AMDGPUDAGToDAGISel::getPassName() const { 844 return "AMDGPU DAG->DAG Pattern Instruction Selection"; 845 } 846 847 //===----------------------------------------------------------------------===// 848 // Complex Patterns 849 //===----------------------------------------------------------------------===// 850 851 bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, 852 SDValue &Offset) { 853 return false; 854 } 855 856 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, 857 SDValue &Offset) { 858 ConstantSDNode *C; 859 SDLoc DL(Addr); 860 861 if ((C = dyn_cast<ConstantSDNode>(Addr))) { 862 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); 863 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 864 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && 865 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { 866 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); 867 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 868 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && 869 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { 870 Base = Addr.getOperand(0); 871 Offset = 
        CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

// FIXME: Should only handle addcarry/subcarry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo, 0),
    Sub0,
    SDValue(AddHi, 0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use.
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
                                                 : AMDGPU::V_SUBB_U32_e64;
  CurDAG->SelectNodeTo(
      N, Opc, N->getVTList(),
      {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have an
  // unsigned carry out despite the _i32 name. These were renamed in VI to
  // _U32.
  // FIXME: We should probably rename the opcodes here.
  unsigned Opc = N->getOpcode() == ISD::UADDO ?
    AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;

  CurDAG->SelectNodeTo(
      N, Opc, N->getVTList(),
      {N->getOperand(0), N->getOperand(1),
       CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;

  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32;

  SDValue CarryIn = N->getOperand(3);
  // V_DIV_FMAS implicitly reads VCC.
  SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL,
                                     AMDGPU::VCC, CarryIn, SDValue());

  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);

  Ops[8] = VCC;
  Ops[9] = VCC.getValue(1);

  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
                                         unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
1063 return CurDAG->SignBitIsZero(Base); 1064 } 1065 1066 bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, 1067 SDValue &Offset) const { 1068 SDLoc DL(Addr); 1069 if (CurDAG->isBaseWithConstantOffset(Addr)) { 1070 SDValue N0 = Addr.getOperand(0); 1071 SDValue N1 = Addr.getOperand(1); 1072 ConstantSDNode *C1 = cast<ConstantSDNode>(N1); 1073 if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { 1074 // (add n0, c0) 1075 Base = N0; 1076 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); 1077 return true; 1078 } 1079 } else if (Addr.getOpcode() == ISD::SUB) { 1080 // sub C, x -> add (sub 0, x), C 1081 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { 1082 int64_t ByteOffset = C->getSExtValue(); 1083 if (isUInt<16>(ByteOffset)) { 1084 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); 1085 1086 // XXX - This is kind of hacky. Create a dummy sub node so we can check 1087 // the known bits in isDSOffsetLegal. We need to emit the selected node 1088 // here, so this is thrown away. 1089 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, 1090 Zero, Addr.getOperand(1)); 1091 1092 if (isDSOffsetLegal(Sub, ByteOffset, 16)) { 1093 SmallVector<SDValue, 3> Opnds; 1094 Opnds.push_back(Zero); 1095 Opnds.push_back(Addr.getOperand(1)); 1096 1097 // FIXME: Select to VOP3 version for with-carry. 1098 unsigned SubOp = AMDGPU::V_SUB_I32_e32; 1099 if (Subtarget->hasAddNoCarry()) { 1100 SubOp = AMDGPU::V_SUB_U32_e64; 1101 Opnds.push_back( 1102 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit 1103 } 1104 1105 MachineSDNode *MachineSub = 1106 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds); 1107 1108 Base = SDValue(MachineSub, 0); 1109 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16); 1110 return true; 1111 } 1112 } 1113 } 1114 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { 1115 // If we have a constant address, prefer to put the constant into the 1116 // offset. This can save moves to load the constant address since multiple 1117 // operations can share the zero base address register, and enables merging 1118 // into read2 / write2 instructions. 1119 1120 SDLoc DL(Addr); 1121 1122 if (isUInt<16>(CAddr->getZExtValue())) { 1123 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); 1124 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, 1125 DL, MVT::i32, Zero); 1126 Base = SDValue(MovZero, 0); 1127 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); 1128 return true; 1129 } 1130 } 1131 1132 // default case 1133 Base = Addr; 1134 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16); 1135 return true; 1136 } 1137 1138 // TODO: If offset is too big, put low 16-bit into offset. 
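// Select a DS address for the 64-bit, 4-byte-aligned (ds_read2/ds_write2
// style) forms: split a (base + constant) address into a base register and
// two 8-bit offsets, expressed in dwords, addressing the two adjacent 32-bit
// halves.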
1139 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, 1140 SDValue &Offset0, 1141 SDValue &Offset1) const { 1142 SDLoc DL(Addr); 1143 1144 if (CurDAG->isBaseWithConstantOffset(Addr)) { 1145 SDValue N0 = Addr.getOperand(0); 1146 SDValue N1 = Addr.getOperand(1); 1147 ConstantSDNode *C1 = cast<ConstantSDNode>(N1); 1148 unsigned DWordOffset0 = C1->getZExtValue() / 4; 1149 unsigned DWordOffset1 = DWordOffset0 + 1; 1150 // (add n0, c0) 1151 if (isDSOffsetLegal(N0, DWordOffset1, 8)) { 1152 Base = N0; 1153 Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); 1154 Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); 1155 return true; 1156 } 1157 } else if (Addr.getOpcode() == ISD::SUB) { 1158 // sub C, x -> add (sub 0, x), C 1159 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { 1160 unsigned DWordOffset0 = C->getZExtValue() / 4; 1161 unsigned DWordOffset1 = DWordOffset0 + 1; 1162 1163 if (isUInt<8>(DWordOffset0)) { 1164 SDLoc DL(Addr); 1165 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); 1166 1167 // XXX - This is kind of hacky. Create a dummy sub node so we can check 1168 // the known bits in isDSOffsetLegal. We need to emit the selected node 1169 // here, so this is thrown away. 1170 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, 1171 Zero, Addr.getOperand(1)); 1172 1173 if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { 1174 SmallVector<SDValue, 3> Opnds; 1175 Opnds.push_back(Zero); 1176 Opnds.push_back(Addr.getOperand(1)); 1177 unsigned SubOp = AMDGPU::V_SUB_I32_e32; 1178 if (Subtarget->hasAddNoCarry()) { 1179 SubOp = AMDGPU::V_SUB_U32_e64; 1180 Opnds.push_back( 1181 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit 1182 } 1183 1184 MachineSDNode *MachineSub 1185 = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds); 1186 1187 Base = SDValue(MachineSub, 0); 1188 Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); 1189 Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); 1190 return true; 1191 } 1192 } 1193 } 1194 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { 1195 unsigned DWordOffset0 = CAddr->getZExtValue() / 4; 1196 unsigned DWordOffset1 = DWordOffset0 + 1; 1197 assert(4 * DWordOffset0 == CAddr->getZExtValue()); 1198 1199 if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { 1200 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); 1201 MachineSDNode *MovZero 1202 = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, 1203 DL, MVT::i32, Zero); 1204 Base = SDValue(MovZero, 0); 1205 Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); 1206 Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); 1207 return true; 1208 } 1209 } 1210 1211 // default case 1212 1213 Base = Addr; 1214 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); 1215 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); 1216 return true; 1217 } 1218 1219 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, 1220 SDValue &VAddr, SDValue &SOffset, 1221 SDValue &Offset, SDValue &Offen, 1222 SDValue &Idxen, SDValue &Addr64, 1223 SDValue &GLC, SDValue &SLC, 1224 SDValue &TFE, SDValue &DLC) const { 1225 // Subtarget prefers to use flat instruction 1226 if (Subtarget->useFlatForGlobal()) 1227 return false; 1228 1229 SDLoc DL(Addr); 1230 1231 if (!GLC.getNode()) 1232 GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); 1233 if (!SLC.getNode()) 1234 SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); 1235 TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); 1236 
DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); 1237 1238 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); 1239 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); 1240 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); 1241 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); 1242 1243 ConstantSDNode *C1 = nullptr; 1244 SDValue N0 = Addr; 1245 if (CurDAG->isBaseWithConstantOffset(Addr)) { 1246 C1 = cast<ConstantSDNode>(Addr.getOperand(1)); 1247 if (isUInt<32>(C1->getZExtValue())) 1248 N0 = Addr.getOperand(0); 1249 else 1250 C1 = nullptr; 1251 } 1252 1253 if (N0.getOpcode() == ISD::ADD) { 1254 // (add N2, N3) -> addr64, or 1255 // (add (add N2, N3), C1) -> addr64 1256 SDValue N2 = N0.getOperand(0); 1257 SDValue N3 = N0.getOperand(1); 1258 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); 1259 1260 if (N2->isDivergent()) { 1261 if (N3->isDivergent()) { 1262 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the 1263 // addr64, and construct the resource from a 0 address. 1264 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0); 1265 VAddr = N0; 1266 } else { 1267 // N2 is divergent, N3 is not. 1268 Ptr = N3; 1269 VAddr = N2; 1270 } 1271 } else { 1272 // N2 is not divergent. 1273 Ptr = N2; 1274 VAddr = N3; 1275 } 1276 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); 1277 } else if (N0->isDivergent()) { 1278 // N0 is divergent. Use it as the addr64, and construct the resource from a 1279 // 0 address. 1280 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0); 1281 VAddr = N0; 1282 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); 1283 } else { 1284 // N0 -> offset, or 1285 // (N0 + C1) -> offset 1286 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); 1287 Ptr = N0; 1288 } 1289 1290 if (!C1) { 1291 // No offset. 1292 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); 1293 return true; 1294 } 1295 1296 if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) { 1297 // Legal offset for instruction. 1298 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); 1299 return true; 1300 } 1301 1302 // Illegal offset, store it in soffset. 1303 Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); 1304 SOffset = 1305 SDValue(CurDAG->getMachineNode( 1306 AMDGPU::S_MOV_B32, DL, MVT::i32, 1307 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), 1308 0); 1309 return true; 1310 } 1311 1312 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, 1313 SDValue &VAddr, SDValue &SOffset, 1314 SDValue &Offset, SDValue &GLC, 1315 SDValue &SLC, SDValue &TFE, 1316 SDValue &DLC) const { 1317 SDValue Ptr, Offen, Idxen, Addr64; 1318 1319 // addr64 bit was removed for volcanic islands. 
1320 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 1321 return false; 1322 1323 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, 1324 GLC, SLC, TFE, DLC)) 1325 return false; 1326 1327 ConstantSDNode *C = cast<ConstantSDNode>(Addr64); 1328 if (C->getSExtValue()) { 1329 SDLoc DL(Addr); 1330 1331 const SITargetLowering& Lowering = 1332 *static_cast<const SITargetLowering*>(getTargetLowering()); 1333 1334 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0); 1335 return true; 1336 } 1337 1338 return false; 1339 } 1340 1341 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, 1342 SDValue &VAddr, SDValue &SOffset, 1343 SDValue &Offset, 1344 SDValue &SLC) const { 1345 SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); 1346 SDValue GLC, TFE, DLC; 1347 1348 return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC); 1349 } 1350 1351 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 1352 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 1353 return PSV && PSV->isStack(); 1354 } 1355 1356 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { 1357 const MachineFunction &MF = CurDAG->getMachineFunction(); 1358 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1359 1360 if (auto FI = dyn_cast<FrameIndexSDNode>(N)) { 1361 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), 1362 FI->getValueType(0)); 1363 1364 // If we can resolve this to a frame index access, this is relative to the 1365 // frame pointer SGPR. 1366 return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(), 1367 MVT::i32)); 1368 } 1369 1370 // If we don't know this private access is a local stack object, it needs to 1371 // be relative to the entry point's scratch wave offset register. 1372 return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(), 1373 MVT::i32)); 1374 } 1375 1376 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, 1377 SDValue Addr, SDValue &Rsrc, 1378 SDValue &VAddr, SDValue &SOffset, 1379 SDValue &ImmOffset) const { 1380 1381 SDLoc DL(Addr); 1382 MachineFunction &MF = CurDAG->getMachineFunction(); 1383 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1384 1385 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); 1386 1387 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { 1388 unsigned Imm = CAddr->getZExtValue(); 1389 1390 SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); 1391 MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, 1392 DL, MVT::i32, HighBits); 1393 VAddr = SDValue(MovHighBits, 0); 1394 1395 // In a call sequence, stores to the argument stack area are relative to the 1396 // stack pointer. 1397 const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); 1398 unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? 1399 Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); 1400 1401 SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32); 1402 ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); 1403 return true; 1404 } 1405 1406 if (CurDAG->isBaseWithConstantOffset(Addr)) { 1407 // (add n0, c1) 1408 1409 SDValue N0 = Addr.getOperand(0); 1410 SDValue N1 = Addr.getOperand(1); 1411 1412 // Offsets in vaddr must be positive if range checking is enabled. 1413 // 1414 // The total computation of vaddr + soffset + offset must not overflow. 
If 1415 // vaddr is negative, even if offset is 0 the sgpr offset add will end up 1416 // overflowing. 1417 // 1418 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would 1419 // always perform a range check. If a negative vaddr base index was used, 1420 // this would fail the range check. The overall address computation would 1421 // compute a valid address, but this doesn't happen due to the range 1422 // check. For out-of-bounds MUBUF loads, a 0 is returned. 1423 // 1424 // Therefore it should be safe to fold any VGPR offset on gfx9 into the 1425 // MUBUF vaddr, but not on older subtargets which can only do this if the 1426 // sign bit is known 0. 1427 ConstantSDNode *C1 = cast<ConstantSDNode>(N1); 1428 if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) && 1429 (!Subtarget->privateMemoryResourceIsRangeChecked() || 1430 CurDAG->SignBitIsZero(N0))) { 1431 std::tie(VAddr, SOffset) = foldFrameIndex(N0); 1432 ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); 1433 return true; 1434 } 1435 } 1436 1437 // (node) 1438 std::tie(VAddr, SOffset) = foldFrameIndex(Addr); 1439 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); 1440 return true; 1441 } 1442 1443 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, 1444 SDValue Addr, 1445 SDValue &SRsrc, 1446 SDValue &SOffset, 1447 SDValue &Offset) const { 1448 ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr); 1449 if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) 1450 return false; 1451 1452 SDLoc DL(Addr); 1453 MachineFunction &MF = CurDAG->getMachineFunction(); 1454 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1455 1456 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); 1457 1458 const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); 1459 unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? 1460 Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); 1461 1462 // FIXME: Get from MachinePointerInfo? We should only be using the frame 1463 // offset if we know this is in a call sequence. 
1464 SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32); 1465 1466 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); 1467 return true; 1468 } 1469 1470 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, 1471 SDValue &SOffset, SDValue &Offset, 1472 SDValue &GLC, SDValue &SLC, 1473 SDValue &TFE, SDValue &DLC) const { 1474 SDValue Ptr, VAddr, Offen, Idxen, Addr64; 1475 const SIInstrInfo *TII = 1476 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 1477 1478 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, 1479 GLC, SLC, TFE, DLC)) 1480 return false; 1481 1482 if (!cast<ConstantSDNode>(Offen)->getSExtValue() && 1483 !cast<ConstantSDNode>(Idxen)->getSExtValue() && 1484 !cast<ConstantSDNode>(Addr64)->getSExtValue()) { 1485 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | 1486 APInt::getAllOnesValue(32).getZExtValue(); // Size 1487 SDLoc DL(Addr); 1488 1489 const SITargetLowering& Lowering = 1490 *static_cast<const SITargetLowering*>(getTargetLowering()); 1491 1492 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); 1493 return true; 1494 } 1495 return false; 1496 } 1497 1498 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, 1499 SDValue &Soffset, SDValue &Offset 1500 ) const { 1501 SDValue GLC, SLC, TFE, DLC; 1502 1503 return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); 1504 } 1505 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, 1506 SDValue &Soffset, SDValue &Offset, 1507 SDValue &SLC) const { 1508 SDValue GLC, TFE, DLC; 1509 1510 return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); 1511 } 1512 1513 template <bool IsSigned> 1514 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, 1515 SDValue Addr, 1516 SDValue &VAddr, 1517 SDValue &Offset, 1518 SDValue &SLC) const { 1519 return static_cast<const SITargetLowering*>(getTargetLowering())-> 1520 SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC); 1521 } 1522 1523 bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N, 1524 SDValue Addr, 1525 SDValue &VAddr, 1526 SDValue &Offset, 1527 SDValue &SLC) const { 1528 return SelectFlatOffset<false>(N, Addr, VAddr, Offset, SLC); 1529 } 1530 1531 bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N, 1532 SDValue Addr, 1533 SDValue &VAddr, 1534 SDValue &Offset, 1535 SDValue &SLC) const { 1536 return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC); 1537 } 1538 1539 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, 1540 SDValue &Offset, bool &Imm) const { 1541 1542 // FIXME: Handle non-constant offsets. 1543 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); 1544 if (!C) 1545 return false; 1546 1547 SDLoc SL(ByteOffsetNode); 1548 GCNSubtarget::Generation Gen = Subtarget->getGeneration(); 1549 int64_t ByteOffset = C->getSExtValue(); 1550 int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset); 1551 1552 if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) { 1553 Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); 1554 Imm = true; 1555 return true; 1556 } 1557 1558 if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset)) 1559 return false; 1560 1561 if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) { 1562 // 32-bit Immediates are supported on Sea Islands. 
1563 Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); 1564 } else { 1565 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); 1566 Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, 1567 C32Bit), 0); 1568 } 1569 Imm = false; 1570 return true; 1571 } 1572 1573 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { 1574 if (Addr.getValueType() != MVT::i32) 1575 return Addr; 1576 1577 // Zero-extend a 32-bit address. 1578 SDLoc SL(Addr); 1579 1580 const MachineFunction &MF = CurDAG->getMachineFunction(); 1581 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1582 unsigned AddrHiVal = Info->get32BitAddressHighBits(); 1583 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32); 1584 1585 const SDValue Ops[] = { 1586 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32), 1587 Addr, 1588 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), 1589 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi), 1590 0), 1591 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32), 1592 }; 1593 1594 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64, 1595 Ops), 0); 1596 } 1597 1598 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, 1599 SDValue &Offset, bool &Imm) const { 1600 SDLoc SL(Addr); 1601 1602 // A 32-bit (address + offset) should not cause unsigned 32-bit integer 1603 // wraparound, because s_load instructions perform the addition in 64 bits. 1604 if ((Addr.getValueType() != MVT::i32 || 1605 Addr->getFlags().hasNoUnsignedWrap()) && 1606 CurDAG->isBaseWithConstantOffset(Addr)) { 1607 SDValue N0 = Addr.getOperand(0); 1608 SDValue N1 = Addr.getOperand(1); 1609 1610 if (SelectSMRDOffset(N1, Offset, Imm)) { 1611 SBase = Expand32BitAddress(N0); 1612 return true; 1613 } 1614 } 1615 SBase = Expand32BitAddress(Addr); 1616 Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); 1617 Imm = true; 1618 return true; 1619 } 1620 1621 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, 1622 SDValue &Offset) const { 1623 bool Imm; 1624 return SelectSMRD(Addr, SBase, Offset, Imm) && Imm; 1625 } 1626 1627 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, 1628 SDValue &Offset) const { 1629 1630 if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) 1631 return false; 1632 1633 bool Imm; 1634 if (!SelectSMRD(Addr, SBase, Offset, Imm)) 1635 return false; 1636 1637 return !Imm && isa<ConstantSDNode>(Offset); 1638 } 1639 1640 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, 1641 SDValue &Offset) const { 1642 bool Imm; 1643 return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm && 1644 !isa<ConstantSDNode>(Offset); 1645 } 1646 1647 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, 1648 SDValue &Offset) const { 1649 bool Imm; 1650 return SelectSMRDOffset(Addr, Offset, Imm) && Imm; 1651 } 1652 1653 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr, 1654 SDValue &Offset) const { 1655 if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) 1656 return false; 1657 1658 bool Imm; 1659 if (!SelectSMRDOffset(Addr, Offset, Imm)) 1660 return false; 1661 1662 return !Imm && isa<ConstantSDNode>(Offset); 1663 } 1664 1665 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, 1666 SDValue &Base, 1667 SDValue &Offset) const { 1668 SDLoc DL(Index); 1669 1670 if (CurDAG->isBaseWithConstantOffset(Index)) { 1671 SDValue N0 = Index.getOperand(0); 1672 SDValue N1 = 
      Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  if (isa<ConstantSDNode>(Index))
    return false;

  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  // Transformation function: pack the offset and width of a BFE into
  // the format expected by S_BFE_I32 / S_BFE_U32. In the second source
  // operand, bits [5:0] contain the offset and bits [22:16] the width.
  uint32_t PackedVal = Offset | (Width << 16);
  SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);

  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}

void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "(a << b) srl c" ---> "BFE_U32 a, (c-b), (32-c)"
  // "(a << b) sra c" ---> "BFE_I32 a, (c-b), (32-c)"
  // Predicate: 0 < b <= c < 32

  const SDValue &Shl = N->getOperand(0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      bool Signed = N->getOpcode() == ISD::SRA;
      unsigned Opcode = Signed ?
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 1721 1722 ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal, 1723 32 - CVal)); 1724 return; 1725 } 1726 } 1727 SelectCode(N); 1728 } 1729 1730 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { 1731 switch (N->getOpcode()) { 1732 case ISD::AND: 1733 if (N->getOperand(0).getOpcode() == ISD::SRL) { 1734 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" 1735 // Predicate: isMask(mask) 1736 const SDValue &Srl = N->getOperand(0); 1737 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1)); 1738 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1)); 1739 1740 if (Shift && Mask) { 1741 uint32_t ShiftVal = Shift->getZExtValue(); 1742 uint32_t MaskVal = Mask->getZExtValue(); 1743 1744 if (isMask_32(MaskVal)) { 1745 uint32_t WidthVal = countPopulation(MaskVal); 1746 1747 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), 1748 Srl.getOperand(0), ShiftVal, WidthVal)); 1749 return; 1750 } 1751 } 1752 } 1753 break; 1754 case ISD::SRL: 1755 if (N->getOperand(0).getOpcode() == ISD::AND) { 1756 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" 1757 // Predicate: isMask(mask >> b) 1758 const SDValue &And = N->getOperand(0); 1759 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1)); 1760 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1)); 1761 1762 if (Shift && Mask) { 1763 uint32_t ShiftVal = Shift->getZExtValue(); 1764 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; 1765 1766 if (isMask_32(MaskVal)) { 1767 uint32_t WidthVal = countPopulation(MaskVal); 1768 1769 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), 1770 And.getOperand(0), ShiftVal, WidthVal)); 1771 return; 1772 } 1773 } 1774 } else if (N->getOperand(0).getOpcode() == ISD::SHL) { 1775 SelectS_BFEFromShifts(N); 1776 return; 1777 } 1778 break; 1779 case ISD::SRA: 1780 if (N->getOperand(0).getOpcode() == ISD::SHL) { 1781 SelectS_BFEFromShifts(N); 1782 return; 1783 } 1784 break; 1785 1786 case ISD::SIGN_EXTEND_INREG: { 1787 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8 1788 SDValue Src = N->getOperand(0); 1789 if (Src.getOpcode() != ISD::SRL) 1790 break; 1791 1792 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1)); 1793 if (!Amt) 1794 break; 1795 1796 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits(); 1797 ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0), 1798 Amt->getZExtValue(), Width)); 1799 return; 1800 } 1801 } 1802 1803 SelectCode(N); 1804 } 1805 1806 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const { 1807 assert(N->getOpcode() == ISD::BRCOND); 1808 if (!N->hasOneUse()) 1809 return false; 1810 1811 SDValue Cond = N->getOperand(1); 1812 if (Cond.getOpcode() == ISD::CopyToReg) 1813 Cond = Cond.getOperand(2); 1814 1815 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse()) 1816 return false; 1817 1818 MVT VT = Cond.getOperand(0).getSimpleValueType(); 1819 if (VT == MVT::i32) 1820 return true; 1821 1822 if (VT == MVT::i64) { 1823 auto ST = static_cast<const GCNSubtarget *>(Subtarget); 1824 1825 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 1826 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64(); 1827 } 1828 1829 return false; 1830 } 1831 1832 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { 1833 SDValue Cond = N->getOperand(1); 1834 1835 if (Cond.isUndef()) { 1836 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other, 1837 N->getOperand(2), N->getOperand(0)); 1838 
    return;
  }

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
  unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC;
  SDLoc SL(N);

  if (!UseSCCBr) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // (For the case that we select S_CBRANCH_SCC1 and it gets changed to
    // S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU, which inserts the S_AND.)
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when it is unnecessary. But it would be better to add a
    // separate pass after SIFixSGPRCopies to do the unnecessary S_AND
    // removal, so that it catches both cases.
    Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
                                          CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
                                          Cond),
                   0);
  }

  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}

void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
  MVT VT = N->getSimpleValueType(0);
  bool IsFMA = N->getOpcode() == ISD::FMA;
  if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
                         !Subtarget->hasFmaMixInsts()) ||
      ((IsFMA && Subtarget->hasMadMixInsts()) ||
       (!IsFMA && Subtarget->hasFmaMixInsts()))) {
    SelectCode(N);
    return;
  }

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);
  unsigned Src0Mods, Src1Mods, Src2Mods;

  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an
  // operand using the conversion from f16.
  bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
  bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
  bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);

  assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
         "fmad selected with denormals enabled");
  // TODO: We can select this with f32 denormals enabled if all the sources are
  // converted from f16 (in which case fmad isn't legal).

  if (Sel0 || Sel1 || Sel2) {
    // For dummy operands.
    SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
    SDValue Ops[] = {
      CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
      CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
      CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
      CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
      Zero, Zero
    };

    CurDAG->SelectNodeTo(N,
                         IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
                         MVT::f32, Ops);
  } else {
    SelectCode(N);
  }
}

// This is here because there isn't a way to use the generated sub0_sub1 as the
// subreg index to EXTRACT_SUBREG in tablegen.
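// The BUFFER_ATOMIC_CMPSWAP*_RTN instructions selected below return the old
// memory value in the low half of their wide result register, so the code
// extracts sub0 (32-bit) or sub0_sub1 (64-bit) from the machine node and uses
// that to replace the value produced by the original node.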
1920 void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { 1921 MemSDNode *Mem = cast<MemSDNode>(N); 1922 unsigned AS = Mem->getAddressSpace(); 1923 if (AS == AMDGPUAS::FLAT_ADDRESS) { 1924 SelectCode(N); 1925 return; 1926 } 1927 1928 MVT VT = N->getSimpleValueType(0); 1929 bool Is32 = (VT == MVT::i32); 1930 SDLoc SL(N); 1931 1932 MachineSDNode *CmpSwap = nullptr; 1933 if (Subtarget->hasAddr64()) { 1934 SDValue SRsrc, VAddr, SOffset, Offset, SLC; 1935 1936 if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) { 1937 unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN : 1938 AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN; 1939 SDValue CmpVal = Mem->getOperand(2); 1940 1941 // XXX - Do we care about glue operands? 1942 1943 SDValue Ops[] = { 1944 CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain() 1945 }; 1946 1947 CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); 1948 } 1949 } 1950 1951 if (!CmpSwap) { 1952 SDValue SRsrc, SOffset, Offset, SLC; 1953 if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) { 1954 unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN : 1955 AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN; 1956 1957 SDValue CmpVal = Mem->getOperand(2); 1958 SDValue Ops[] = { 1959 CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain() 1960 }; 1961 1962 CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); 1963 } 1964 } 1965 1966 if (!CmpSwap) { 1967 SelectCode(N); 1968 return; 1969 } 1970 1971 MachineMemOperand *MMO = Mem->getMemOperand(); 1972 CurDAG->setNodeMemRefs(CmpSwap, {MMO}); 1973 1974 unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1975 SDValue Extract 1976 = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0)); 1977 1978 ReplaceUses(SDValue(N, 0), Extract); 1979 ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1)); 1980 CurDAG->RemoveDeadNode(N); 1981 } 1982 1983 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { 1984 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 1985 if ((IntrID != Intrinsic::amdgcn_ds_append && 1986 IntrID != Intrinsic::amdgcn_ds_consume) || 1987 N->getValueType(0) != MVT::i32) { 1988 SelectCode(N); 1989 return; 1990 } 1991 1992 // The address is assumed to be uniform, so if it ends up in a VGPR, it will 1993 // be copied to an SGPR with readfirstlane. 1994 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ? 
1995 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1996 1997 SDValue Chain = N->getOperand(0); 1998 SDValue Ptr = N->getOperand(2); 1999 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); 2000 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 2001 2002 SDValue Offset; 2003 if (CurDAG->isBaseWithConstantOffset(Ptr)) { 2004 SDValue PtrBase = Ptr.getOperand(0); 2005 SDValue PtrOffset = Ptr.getOperand(1); 2006 2007 const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue(); 2008 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) { 2009 N = glueCopyToM0(N, PtrBase); 2010 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32); 2011 } 2012 } 2013 2014 if (!Offset) { 2015 N = glueCopyToM0(N, Ptr); 2016 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); 2017 } 2018 2019 SDValue Ops[] = { 2020 Offset, 2021 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32), 2022 Chain, 2023 N->getOperand(N->getNumOperands() - 1) // New glue 2024 }; 2025 2026 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 2027 } 2028 2029 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, 2030 unsigned &Mods) const { 2031 Mods = 0; 2032 Src = In; 2033 2034 if (Src.getOpcode() == ISD::FNEG) { 2035 Mods |= SISrcMods::NEG; 2036 Src = Src.getOperand(0); 2037 } 2038 2039 if (Src.getOpcode() == ISD::FABS) { 2040 Mods |= SISrcMods::ABS; 2041 Src = Src.getOperand(0); 2042 } 2043 2044 return true; 2045 } 2046 2047 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, 2048 SDValue &SrcMods) const { 2049 unsigned Mods; 2050 if (SelectVOP3ModsImpl(In, Src, Mods)) { 2051 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 2052 return true; 2053 } 2054 2055 return false; 2056 } 2057 2058 bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, 2059 SDValue &SrcMods) const { 2060 SelectVOP3Mods(In, Src, SrcMods); 2061 return isNoNanSrc(Src); 2062 } 2063 2064 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { 2065 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG) 2066 return false; 2067 2068 Src = In; 2069 return true; 2070 } 2071 2072 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, 2073 SDValue &SrcMods, SDValue &Clamp, 2074 SDValue &Omod) const { 2075 SDLoc DL(In); 2076 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); 2077 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); 2078 2079 return SelectVOP3Mods(In, Src, SrcMods); 2080 } 2081 2082 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, 2083 SDValue &SrcMods, 2084 SDValue &Clamp, 2085 SDValue &Omod) const { 2086 Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); 2087 return SelectVOP3Mods(In, Src, SrcMods); 2088 } 2089 2090 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, 2091 SDValue &Clamp, SDValue &Omod) const { 2092 Src = In; 2093 2094 SDLoc DL(In); 2095 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); 2096 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); 2097 2098 return true; 2099 } 2100 2101 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, 2102 SDValue &SrcMods) const { 2103 unsigned Mods = 0; 2104 Src = In; 2105 2106 if (Src.getOpcode() == ISD::FNEG) { 2107 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 2108 Src = Src.getOperand(0); 2109 } 2110 2111 if (Src.getOpcode() == ISD::BUILD_VECTOR) { 2112 unsigned VecMods = Mods; 2113 2114 SDValue Lo = stripBitcast(Src.getOperand(0)); 2115 SDValue Hi = stripBitcast(Src.getOperand(1)); 2116 2117 if 
(Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Lo.getOperand(0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
    }

    if (isExtractHiElt(Lo, Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(Hi, Hi))
      Mods |= SISrcMods::OP_SEL_1;

    Lo = stripExtractLoElt(Lo);
    Hi = stripExtractLoElt(Hi);

    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register
      // to avoid packing.

      Src = Lo;
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }

    Mods = VecMods;
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp and op_sel
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3PMods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSel(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
                                              SDValue &SrcMods,
                                              SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSelMods(In, Src, SrcMods);
}

// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16.
    // If the source's op_sel is set, it picks the high half of the source
    // register.
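    // As an illustration, a source like
    //   (f32 (fp_extend (f16 (extract_vector_elt v2f16:$v, 1))))
    // takes the high half of $v, so OP_SEL_0 is set below in addition to the
    // OP_SEL_1 bit that requests the f16 conversion.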

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
      C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}

bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    return false;
  }
  const SIRegisterInfo *SIRI =
    static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo * SII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be one that needs to be an
    // SGPR, e.g. an inline asm constraint.
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass) {
      AllUsesAcceptSReg = false;
      SDNode * User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        MCInstrDesc Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
              getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If AllUsesAcceptSReg is still false, we have not succeeded in
      // commuting the current user, which means at least one use strictly
      // requires a VGPR. Thus, we will not attempt to commute any other
      // user instructions.
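      // For example, an immediate whose use is the VGPR-only data operand of
      // a store cannot be commuted into an SGPR-capable operand, so we stop
      // scanning and report that the value belongs in a VGPR.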
2317 if (!AllUsesAcceptSReg) 2318 break; 2319 } 2320 } 2321 return !AllUsesAcceptSReg && (Limit < 10); 2322 } 2323 2324 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const { 2325 auto Ld = cast<LoadSDNode>(N); 2326 2327 return Ld->getAlignment() >= 4 && 2328 ( 2329 ( 2330 ( 2331 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 2332 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT 2333 ) 2334 && 2335 !N->isDivergent() 2336 ) 2337 || 2338 ( 2339 Subtarget->getScalarizeGlobalBehavior() && 2340 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && 2341 !Ld->isVolatile() && 2342 !N->isDivergent() && 2343 static_cast<const SITargetLowering *>( 2344 getTargetLowering())->isMemOpHasNoClobberedMemOperand(N) 2345 ) 2346 ); 2347 } 2348 2349 void AMDGPUDAGToDAGISel::PostprocessISelDAG() { 2350 const AMDGPUTargetLowering& Lowering = 2351 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); 2352 bool IsModified = false; 2353 do { 2354 IsModified = false; 2355 2356 // Go over all selected nodes and try to fold them a bit more 2357 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin(); 2358 while (Position != CurDAG->allnodes_end()) { 2359 SDNode *Node = &*Position++; 2360 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node); 2361 if (!MachineNode) 2362 continue; 2363 2364 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); 2365 if (ResNode != Node) { 2366 if (ResNode) 2367 ReplaceUses(Node, ResNode); 2368 IsModified = true; 2369 } 2370 } 2371 CurDAG->RemoveDeadNodes(); 2372 } while (IsModified); 2373 } 2374 2375 bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { 2376 Subtarget = &MF.getSubtarget<R600Subtarget>(); 2377 return SelectionDAGISel::runOnMachineFunction(MF); 2378 } 2379 2380 bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { 2381 if (!N->readMem()) 2382 return false; 2383 if (CbId == -1) 2384 return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 2385 N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT; 2386 2387 return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId; 2388 } 2389 2390 bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, 2391 SDValue& IntPtr) { 2392 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { 2393 IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), 2394 true); 2395 return true; 2396 } 2397 return false; 2398 } 2399 2400 bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, 2401 SDValue& BaseReg, SDValue &Offset) { 2402 if (!isa<ConstantSDNode>(Addr)) { 2403 BaseReg = Addr; 2404 Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); 2405 return true; 2406 } 2407 return false; 2408 } 2409 2410 void R600DAGToDAGISel::Select(SDNode *N) { 2411 unsigned int Opc = N->getOpcode(); 2412 if (N->isMachineOpcode()) { 2413 N->setNodeId(-1); 2414 return; // Already selected. 2415 } 2416 2417 switch (Opc) { 2418 default: break; 2419 case AMDGPUISD::BUILD_VERTICAL_VECTOR: 2420 case ISD::SCALAR_TO_VECTOR: 2421 case ISD::BUILD_VECTOR: { 2422 EVT VT = N->getValueType(0); 2423 unsigned NumVectorElts = VT.getVectorNumElements(); 2424 unsigned RegClassID; 2425 // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG 2426 // that adds a 128 bits reg copy when going through TwoAddressInstructions 2427 // pass. We want to avoid 128 bits copies as much as possible because they 2428 // can't be bundled by our scheduler. 
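    // The switch below only picks the register class; SelectBuildVector()
    // then assembles the vector value in that class (2 elements use a 64-bit
    // class, 4 elements a 128-bit class).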
2429 switch(NumVectorElts) { 2430 case 2: RegClassID = R600::R600_Reg64RegClassID; break; 2431 case 4: 2432 if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) 2433 RegClassID = R600::R600_Reg128VerticalRegClassID; 2434 else 2435 RegClassID = R600::R600_Reg128RegClassID; 2436 break; 2437 default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); 2438 } 2439 SelectBuildVector(N, RegClassID); 2440 return; 2441 } 2442 } 2443 2444 SelectCode(N); 2445 } 2446 2447 bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, 2448 SDValue &Offset) { 2449 ConstantSDNode *C; 2450 SDLoc DL(Addr); 2451 2452 if ((C = dyn_cast<ConstantSDNode>(Addr))) { 2453 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); 2454 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 2455 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && 2456 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { 2457 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); 2458 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 2459 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && 2460 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { 2461 Base = Addr.getOperand(0); 2462 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); 2463 } else { 2464 Base = Addr; 2465 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); 2466 } 2467 2468 return true; 2469 } 2470 2471 bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, 2472 SDValue &Offset) { 2473 ConstantSDNode *IMMOffset; 2474 2475 if (Addr.getOpcode() == ISD::ADD 2476 && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) 2477 && isInt<16>(IMMOffset->getZExtValue())) { 2478 2479 Base = Addr.getOperand(0); 2480 Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), 2481 MVT::i32); 2482 return true; 2483 // If the pointer address is constant, we can move it to the offset field. 2484 } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr)) 2485 && isInt<16>(IMMOffset->getZExtValue())) { 2486 Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), 2487 SDLoc(CurDAG->getEntryNode()), 2488 R600::ZERO, MVT::i32); 2489 Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), 2490 MVT::i32); 2491 return true; 2492 } 2493 2494 // Default case, no offset 2495 Base = Addr; 2496 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); 2497 return true; 2498 } 2499