1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMD GCN specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 16 17 #include "AMDGPUCallLowering.h" 18 #include "AMDGPUSubtarget.h" 19 #include "SIFrameLowering.h" 20 #include "SIISelLowering.h" 21 #include "SIInstrInfo.h" 22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 23 24 #define GET_SUBTARGETINFO_HEADER 25 #include "AMDGPUGenSubtargetInfo.inc" 26 27 namespace llvm { 28 29 class GCNTargetMachine; 30 31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo, 32 public AMDGPUSubtarget { 33 34 using AMDGPUSubtarget::getMaxWavesPerEU; 35 36 public: 37 // Following 2 enums are documented at: 38 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 39 enum class TrapHandlerAbi { 40 NONE = 0x00, 41 AMDHSA = 0x01, 42 }; 43 44 enum class TrapID { 45 LLVMAMDHSATrap = 0x02, 46 LLVMAMDHSADebugTrap = 0x03, 47 }; 48 49 private: 50 /// GlobalISel related APIs. 51 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 52 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 53 std::unique_ptr<InstructionSelector> InstSelector; 54 std::unique_ptr<LegalizerInfo> Legalizer; 55 std::unique_ptr<RegisterBankInfo> RegBankInfo; 56 57 protected: 58 // Basic subtarget description. 
59 Triple TargetTriple; 60 AMDGPU::IsaInfo::AMDGPUTargetID TargetID; 61 unsigned Gen = INVALID; 62 InstrItineraryData InstrItins; 63 int LDSBankCount = 0; 64 unsigned MaxPrivateElementSize = 0; 65 66 // Possibly statically set by tablegen, but may want to be overridden. 67 bool FastFMAF32 = false; 68 bool FastDenormalF32 = false; 69 bool HalfRate64Ops = false; 70 bool FullRate64Ops = false; 71 72 // Dynamically set bits that enable features. 73 bool FlatForGlobal = false; 74 bool AutoWaitcntBeforeBarrier = false; 75 bool UnalignedScratchAccess = false; 76 bool UnalignedAccessMode = false; 77 bool HasApertureRegs = false; 78 bool SupportsXNACK = false; 79 80 // This should not be used directly. 'TargetID' tracks the dynamic settings 81 // for XNACK. 82 bool EnableXNACK = false; 83 84 bool EnableTgSplit = false; 85 bool EnableCuMode = false; 86 bool TrapHandler = false; 87 88 // Used as options. 89 bool EnableLoadStoreOpt = false; 90 bool EnableUnsafeDSOffsetFolding = false; 91 bool EnableSIScheduler = false; 92 bool EnableDS128 = false; 93 bool EnablePRTStrictNull = false; 94 bool DumpCode = false; 95 96 // Subtarget statically properties set by tablegen 97 bool FP64 = false; 98 bool FMA = false; 99 bool MIMG_R128 = false; 100 bool CIInsts = false; 101 bool GFX8Insts = false; 102 bool GFX9Insts = false; 103 bool GFX90AInsts = false; 104 bool GFX940Insts = false; 105 bool GFX10Insts = false; 106 bool GFX11Insts = false; 107 bool GFX10_3Insts = false; 108 bool GFX7GFX8GFX9Insts = false; 109 bool SGPRInitBug = false; 110 bool UserSGPRInit16Bug = false; 111 bool NegativeScratchOffsetBug = false; 112 bool NegativeUnalignedScratchOffsetBug = false; 113 bool HasSMemRealTime = false; 114 bool HasIntClamp = false; 115 bool HasFmaMixInsts = false; 116 bool HasMovrel = false; 117 bool HasVGPRIndexMode = false; 118 bool HasScalarStores = false; 119 bool HasScalarAtomics = false; 120 bool HasSDWAOmod = false; 121 bool HasSDWAScalar = false; 122 bool HasSDWASdst = false; 123 bool 
HasSDWAMac = false; 124 bool HasSDWAOutModsVOPC = false; 125 bool HasDPP = false; 126 bool HasDPP8 = false; 127 bool Has64BitDPP = false; 128 bool HasPackedFP32Ops = false; 129 bool HasImageInsts = false; 130 bool HasExtendedImageInsts = false; 131 bool HasR128A16 = false; 132 bool HasGFX10A16 = false; 133 bool HasG16 = false; 134 bool HasNSAEncoding = false; 135 unsigned NSAMaxSize = 0; 136 bool GFX10_AEncoding = false; 137 bool GFX10_BEncoding = false; 138 bool HasDLInsts = false; 139 bool HasDot1Insts = false; 140 bool HasDot2Insts = false; 141 bool HasDot3Insts = false; 142 bool HasDot4Insts = false; 143 bool HasDot5Insts = false; 144 bool HasDot6Insts = false; 145 bool HasDot7Insts = false; 146 bool HasDot8Insts = false; 147 bool HasMAIInsts = false; 148 bool HasPkFmacF16Inst = false; 149 bool HasAtomicFaddRtnInsts = false; 150 bool HasAtomicFaddNoRtnInsts = false; 151 bool HasAtomicPkFaddNoRtnInsts = false; 152 bool SupportsSRAMECC = false; 153 154 // This should not be used directly. 'TargetID' tracks the dynamic settings 155 // for SRAMECC. 
bool EnableSRAMECC = false;

// Instruction / addressing capabilities.
bool HasNoSdstCMPX = false;
bool HasVscnt = false;
bool HasGetWaveIdInst = false;
bool HasSMemTimeInst = false;
bool HasShaderCyclesRegister = false;
bool HasVOP3Literal = false;
bool HasNoDataDepHazard = false;
bool FlatAddressSpace = false;
bool FlatInstOffsets = false;
bool FlatGlobalInsts = false;
bool FlatScratchInsts = false;
bool ScalarFlatScratchInsts = false;
bool HasArchitectedFlatScratch = false;
bool EnableFlatScratch = false;
bool AddNoCarryInsts = false;
bool HasUnpackedD16VMem = false;
bool LDSMisalignedBug = false;
bool HasMFMAInlineLiteralBug = false;
bool UnalignedBufferAccess = false;
bool UnalignedDSAccess = false;
bool HasPackedTID = false;
bool ScalarizeGlobal = false;

// Hazard and hardware-bug flags.
bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
bool HasNSAClauseBug = false;
bool HasOffset3fBug = false;
bool HasFlatSegmentOffsetBug = false;
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
bool HasVOPDInsts = false;

// Dummy feature to use for assembler in tablegen.
196 bool FeatureDisable = false; 197 198 SelectionDAGTargetInfo TSInfo; 199 private: 200 SIInstrInfo InstrInfo; 201 SITargetLowering TLInfo; 202 SIFrameLowering FrameLowering; 203 204 public: 205 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 206 const GCNTargetMachine &TM); 207 ~GCNSubtarget() override; 208 209 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 210 StringRef GPU, StringRef FS); 211 212 const SIInstrInfo *getInstrInfo() const override { 213 return &InstrInfo; 214 } 215 216 const SIFrameLowering *getFrameLowering() const override { 217 return &FrameLowering; 218 } 219 220 const SITargetLowering *getTargetLowering() const override { 221 return &TLInfo; 222 } 223 224 const SIRegisterInfo *getRegisterInfo() const override { 225 return &InstrInfo.getRegisterInfo(); 226 } 227 228 const CallLowering *getCallLowering() const override { 229 return CallLoweringInfo.get(); 230 } 231 232 const InlineAsmLowering *getInlineAsmLowering() const override { 233 return InlineAsmLoweringInfo.get(); 234 } 235 236 InstructionSelector *getInstructionSelector() const override { 237 return InstSelector.get(); 238 } 239 240 const LegalizerInfo *getLegalizerInfo() const override { 241 return Legalizer.get(); 242 } 243 244 const RegisterBankInfo *getRegBankInfo() const override { 245 return RegBankInfo.get(); 246 } 247 248 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 249 return TargetID; 250 } 251 252 // Nothing implemented, just prevent crashes on use. 253 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 254 return &TSInfo; 255 } 256 257 const InstrItineraryData *getInstrItineraryData() const override { 258 return &InstrItins; 259 } 260 261 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 262 263 Generation getGeneration() const { 264 return (Generation)Gen; 265 } 266 267 unsigned getMaxWaveScratchSize() const { 268 // See COMPUTE_TMPRING_SIZE.WAVESIZE. 
269 if (getGeneration() < GFX11) { 270 // 13-bit field in units of 256-dword. 271 return (256 * 4) * ((1 << 13) - 1); 272 } 273 // 15-bit field in units of 64-dword. 274 return (64 * 4) * ((1 << 15) - 1); 275 } 276 277 /// Return the number of high bits known to be zero for a frame index. 278 unsigned getKnownHighZeroBitsForFrameIndex() const { 279 return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); 280 } 281 282 int getLDSBankCount() const { 283 return LDSBankCount; 284 } 285 286 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 287 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 288 } 289 290 unsigned getConstantBusLimit(unsigned Opcode) const; 291 292 /// Returns if the result of this instruction with a 16-bit result returned in 293 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 294 /// the original value. 295 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 296 297 bool hasIntClamp() const { 298 return HasIntClamp; 299 } 300 301 bool hasFP64() const { 302 return FP64; 303 } 304 305 bool hasMIMG_R128() const { 306 return MIMG_R128; 307 } 308 309 bool hasHWFP64() const { 310 return FP64; 311 } 312 313 bool hasFastFMAF32() const { 314 return FastFMAF32; 315 } 316 317 bool hasHalfRate64Ops() const { 318 return HalfRate64Ops; 319 } 320 321 bool hasFullRate64Ops() const { 322 return FullRate64Ops; 323 } 324 325 bool hasAddr64() const { 326 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 327 } 328 329 bool hasFlat() const { 330 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 331 } 332 333 // Return true if the target only has the reverse operand versions of VALU 334 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 
335 bool hasOnlyRevVALUShifts() const { 336 return getGeneration() >= VOLCANIC_ISLANDS; 337 } 338 339 bool hasFractBug() const { 340 return getGeneration() == SOUTHERN_ISLANDS; 341 } 342 343 bool hasBFE() const { 344 return true; 345 } 346 347 bool hasBFI() const { 348 return true; 349 } 350 351 bool hasBFM() const { 352 return hasBFE(); 353 } 354 355 bool hasBCNT(unsigned Size) const { 356 return true; 357 } 358 359 bool hasFFBL() const { 360 return true; 361 } 362 363 bool hasFFBH() const { 364 return true; 365 } 366 367 bool hasMed3_16() const { 368 return getGeneration() >= AMDGPUSubtarget::GFX9; 369 } 370 371 bool hasMin3Max3_16() const { 372 return getGeneration() >= AMDGPUSubtarget::GFX9; 373 } 374 375 bool hasFmaMixInsts() const { 376 return HasFmaMixInsts; 377 } 378 379 bool hasCARRY() const { 380 return true; 381 } 382 383 bool hasFMA() const { 384 return FMA; 385 } 386 387 bool hasSwap() const { 388 return GFX9Insts; 389 } 390 391 bool hasScalarPackInsts() const { 392 return GFX9Insts; 393 } 394 395 bool hasScalarMulHiInsts() const { 396 return GFX9Insts; 397 } 398 399 TrapHandlerAbi getTrapHandlerAbi() const { 400 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 401 } 402 403 bool supportsGetDoorbellID() const { 404 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 405 return getGeneration() >= GFX9; 406 } 407 408 /// True if the offset field of DS instructions works as expected. On SI, the 409 /// offset uses a 16-bit adder and does not always wrap properly. 410 bool hasUsableDSOffset() const { 411 return getGeneration() >= SEA_ISLANDS; 412 } 413 414 bool unsafeDSOffsetFoldingEnabled() const { 415 return EnableUnsafeDSOffsetFolding; 416 } 417 418 /// Condition output from div_scale is usable. 419 bool hasUsableDivScaleConditionOutput() const { 420 return getGeneration() != SOUTHERN_ISLANDS; 421 } 422 423 /// Extra wait hazard is needed in some cases before 424 /// s_cbranch_vccnz/s_cbranch_vccz. 
425 bool hasReadVCCZBug() const { 426 return getGeneration() <= SEA_ISLANDS; 427 } 428 429 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. 430 bool partialVCCWritesUpdateVCCZ() const { 431 return getGeneration() >= GFX10; 432 } 433 434 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 435 /// was written by a VALU instruction. 436 bool hasSMRDReadVALUDefHazard() const { 437 return getGeneration() == SOUTHERN_ISLANDS; 438 } 439 440 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 441 /// SGPR was written by a VALU Instruction. 442 bool hasVMEMReadSGPRVALUDefHazard() const { 443 return getGeneration() >= VOLCANIC_ISLANDS; 444 } 445 446 bool hasRFEHazards() const { 447 return getGeneration() >= VOLCANIC_ISLANDS; 448 } 449 450 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 451 unsigned getSetRegWaitStates() const { 452 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 453 } 454 455 bool dumpCode() const { 456 return DumpCode; 457 } 458 459 /// Return the amount of LDS that can be used that will not restrict the 460 /// occupancy lower than WaveCount. 461 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 462 const Function &) const; 463 464 bool supportsMinMaxDenormModes() const { 465 return getGeneration() >= AMDGPUSubtarget::GFX9; 466 } 467 468 /// \returns If target supports S_DENORM_MODE. 469 bool hasDenormModeInst() const { 470 return getGeneration() >= AMDGPUSubtarget::GFX10; 471 } 472 473 bool useFlatForGlobal() const { 474 return FlatForGlobal; 475 } 476 477 /// \returns If target supports ds_read/write_b128 and user enables generation 478 /// of ds_read/write_b128. 479 bool useDS128() const { 480 return CIInsts && EnableDS128; 481 } 482 483 /// \return If target supports ds_read/write_b96/128. 
484 bool hasDS96AndDS128() const { 485 return CIInsts; 486 } 487 488 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 489 bool haveRoundOpsF64() const { 490 return CIInsts; 491 } 492 493 /// \returns If MUBUF instructions always perform range checking, even for 494 /// buffer resources used for private memory access. 495 bool privateMemoryResourceIsRangeChecked() const { 496 return getGeneration() < AMDGPUSubtarget::GFX9; 497 } 498 499 /// \returns If target requires PRT Struct NULL support (zero result registers 500 /// for sparse texture support). 501 bool usePRTStrictNull() const { 502 return EnablePRTStrictNull; 503 } 504 505 bool hasAutoWaitcntBeforeBarrier() const { 506 return AutoWaitcntBeforeBarrier; 507 } 508 509 bool hasUnalignedBufferAccess() const { 510 return UnalignedBufferAccess; 511 } 512 513 bool hasUnalignedBufferAccessEnabled() const { 514 return UnalignedBufferAccess && UnalignedAccessMode; 515 } 516 517 bool hasUnalignedDSAccess() const { 518 return UnalignedDSAccess; 519 } 520 521 bool hasUnalignedDSAccessEnabled() const { 522 return UnalignedDSAccess && UnalignedAccessMode; 523 } 524 525 bool hasUnalignedScratchAccess() const { 526 return UnalignedScratchAccess; 527 } 528 529 bool hasUnalignedAccessMode() const { 530 return UnalignedAccessMode; 531 } 532 533 bool hasApertureRegs() const { 534 return HasApertureRegs; 535 } 536 537 bool isTrapHandlerEnabled() const { 538 return TrapHandler; 539 } 540 541 bool isXNACKEnabled() const { 542 return TargetID.isXnackOnOrAny(); 543 } 544 545 bool isTgSplitEnabled() const { 546 return EnableTgSplit; 547 } 548 549 bool isCuModeEnabled() const { 550 return EnableCuMode; 551 } 552 553 bool hasFlatAddressSpace() const { 554 return FlatAddressSpace; 555 } 556 557 bool hasFlatScrRegister() const { 558 return hasFlatAddressSpace(); 559 } 560 561 bool hasFlatInstOffsets() const { 562 return FlatInstOffsets; 563 } 564 565 bool hasFlatGlobalInsts() const { 566 return FlatGlobalInsts; 567 } 568 569 bool 
hasFlatScratchInsts() const { 570 return FlatScratchInsts; 571 } 572 573 // Check if target supports ST addressing mode with FLAT scratch instructions. 574 // The ST addressing mode means no registers are used, either VGPR or SGPR, 575 // but only immediate offset is swizzled and added to the FLAT scratch base. 576 bool hasFlatScratchSTMode() const { 577 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); 578 } 579 580 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } 581 582 bool hasScalarFlatScratchInsts() const { 583 return ScalarFlatScratchInsts; 584 } 585 586 bool enableFlatScratch() const { 587 return flatScratchIsArchitected() || 588 (EnableFlatScratch && hasFlatScratchInsts()); 589 } 590 591 bool hasGlobalAddTidInsts() const { 592 return GFX10_BEncoding; 593 } 594 595 bool hasAtomicCSub() const { 596 return GFX10_BEncoding; 597 } 598 599 bool hasMultiDwordFlatScratchAddressing() const { 600 return getGeneration() >= GFX9; 601 } 602 603 bool hasFlatSegmentOffsetBug() const { 604 return HasFlatSegmentOffsetBug; 605 } 606 607 bool hasFlatLgkmVMemCountInOrder() const { 608 return getGeneration() > GFX9; 609 } 610 611 bool hasD16LoadStore() const { 612 return getGeneration() >= GFX9; 613 } 614 615 bool d16PreservesUnusedBits() const { 616 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); 617 } 618 619 bool hasD16Images() const { 620 return getGeneration() >= VOLCANIC_ISLANDS; 621 } 622 623 /// Return if most LDS instructions have an m0 use that require m0 to be 624 /// initialized. 625 bool ldsRequiresM0Init() const { 626 return getGeneration() < GFX9; 627 } 628 629 // True if the hardware rewinds and replays GWS operations if a wave is 630 // preempted. 631 // 632 // If this is false, a GWS operation requires testing if a nack set the 633 // MEM_VIOL bit, and repeating if so. 
634 bool hasGWSAutoReplay() const { 635 return getGeneration() >= GFX9; 636 } 637 638 /// \returns if target has ds_gws_sema_release_all instruction. 639 bool hasGWSSemaReleaseAll() const { 640 return CIInsts; 641 } 642 643 /// \returns true if the target has integer add/sub instructions that do not 644 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 645 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 646 /// for saturation. 647 bool hasAddNoCarry() const { 648 return AddNoCarryInsts; 649 } 650 651 bool hasUnpackedD16VMem() const { 652 return HasUnpackedD16VMem; 653 } 654 655 // Covers VS/PS/CS graphics shaders 656 bool isMesaGfxShader(const Function &F) const { 657 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 658 } 659 660 bool hasMad64_32() const { 661 return getGeneration() >= SEA_ISLANDS; 662 } 663 664 bool hasSDWAOmod() const { 665 return HasSDWAOmod; 666 } 667 668 bool hasSDWAScalar() const { 669 return HasSDWAScalar; 670 } 671 672 bool hasSDWASdst() const { 673 return HasSDWASdst; 674 } 675 676 bool hasSDWAMac() const { 677 return HasSDWAMac; 678 } 679 680 bool hasSDWAOutModsVOPC() const { 681 return HasSDWAOutModsVOPC; 682 } 683 684 bool hasDLInsts() const { 685 return HasDLInsts; 686 } 687 688 bool hasDot1Insts() const { 689 return HasDot1Insts; 690 } 691 692 bool hasDot2Insts() const { 693 return HasDot2Insts; 694 } 695 696 bool hasDot3Insts() const { 697 return HasDot3Insts; 698 } 699 700 bool hasDot4Insts() const { 701 return HasDot4Insts; 702 } 703 704 bool hasDot5Insts() const { 705 return HasDot5Insts; 706 } 707 708 bool hasDot6Insts() const { 709 return HasDot6Insts; 710 } 711 712 bool hasDot7Insts() const { 713 return HasDot7Insts; 714 } 715 716 bool hasDot8Insts() const { 717 return HasDot8Insts; 718 } 719 720 bool hasMAIInsts() const { 721 return HasMAIInsts; 722 } 723 724 bool hasPkFmacF16Inst() const { 725 return HasPkFmacF16Inst; 726 } 727 728 bool hasAtomicFaddInsts() const { 729 
return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; 730 } 731 732 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } 733 734 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } 735 736 bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; } 737 738 bool hasNoSdstCMPX() const { 739 return HasNoSdstCMPX; 740 } 741 742 bool hasVscnt() const { 743 return HasVscnt; 744 } 745 746 bool hasGetWaveIdInst() const { 747 return HasGetWaveIdInst; 748 } 749 750 bool hasSMemTimeInst() const { 751 return HasSMemTimeInst; 752 } 753 754 bool hasShaderCyclesRegister() const { 755 return HasShaderCyclesRegister; 756 } 757 758 bool hasVOP3Literal() const { 759 return HasVOP3Literal; 760 } 761 762 bool hasNoDataDepHazard() const { 763 return HasNoDataDepHazard; 764 } 765 766 bool vmemWriteNeedsExpWaitcnt() const { 767 return getGeneration() < SEA_ISLANDS; 768 } 769 770 // Scratch is allocated in 256 dword per wave blocks for the entire 771 // wavefront. When viewed from the perspective of an arbitrary workitem, this 772 // is 4-byte aligned. 773 // 774 // Only 4-byte alignment is really needed to access anything. Transformations 775 // on the pointer value itself may rely on the alignment / known low bits of 776 // the pointer. Set this to something above the minimum to avoid needing 777 // dynamic realignment in common cases. 778 Align getStackAlignment() const { return Align(16); } 779 780 bool enableMachineScheduler() const override { 781 return true; 782 } 783 784 bool useAA() const override; 785 786 bool enableSubRegLiveness() const override { 787 return true; 788 } 789 790 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 791 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 792 793 // static wrappers 794 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 795 796 // XXX - Why is this here if it isn't in the default pass set? 
797 bool enableEarlyIfConversion() const override { 798 return true; 799 } 800 801 void overrideSchedPolicy(MachineSchedPolicy &Policy, 802 unsigned NumRegionInstrs) const override; 803 804 unsigned getMaxNumUserSGPRs() const { 805 return 16; 806 } 807 808 bool hasSMemRealTime() const { 809 return HasSMemRealTime; 810 } 811 812 bool hasMovrel() const { 813 return HasMovrel; 814 } 815 816 bool hasVGPRIndexMode() const { 817 return HasVGPRIndexMode; 818 } 819 820 bool useVGPRIndexMode() const; 821 822 bool hasScalarCompareEq64() const { 823 return getGeneration() >= VOLCANIC_ISLANDS; 824 } 825 826 bool hasScalarStores() const { 827 return HasScalarStores; 828 } 829 830 bool hasScalarAtomics() const { 831 return HasScalarAtomics; 832 } 833 834 bool hasLDSFPAtomicAdd() const { return GFX8Insts; } 835 836 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. 837 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 838 839 /// \returns true if the subtarget has the v_permlane64_b32 instruction. 
840 bool hasPermLane64() const { return getGeneration() >= GFX11; } 841 842 bool hasDPP() const { 843 return HasDPP; 844 } 845 846 bool hasDPPBroadcasts() const { 847 return HasDPP && getGeneration() < GFX10; 848 } 849 850 bool hasDPPWavefrontShifts() const { 851 return HasDPP && getGeneration() < GFX10; 852 } 853 854 bool hasDPP8() const { 855 return HasDPP8; 856 } 857 858 bool has64BitDPP() const { 859 return Has64BitDPP; 860 } 861 862 bool hasPackedFP32Ops() const { 863 return HasPackedFP32Ops; 864 } 865 866 bool hasFmaakFmamkF32Insts() const { 867 return getGeneration() >= GFX10 || hasGFX940Insts(); 868 } 869 870 bool hasImageInsts() const { 871 return HasImageInsts; 872 } 873 874 bool hasExtendedImageInsts() const { 875 return HasExtendedImageInsts; 876 } 877 878 bool hasR128A16() const { 879 return HasR128A16; 880 } 881 882 bool hasGFX10A16() const { 883 return HasGFX10A16; 884 } 885 886 bool hasA16() const { return hasR128A16() || hasGFX10A16(); } 887 888 bool hasG16() const { return HasG16; } 889 890 bool hasOffset3fBug() const { 891 return HasOffset3fBug; 892 } 893 894 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 895 896 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 897 898 bool hasNSAEncoding() const { return HasNSAEncoding; } 899 900 unsigned getNSAMaxSize() const { return NSAMaxSize; } 901 902 bool hasGFX10_AEncoding() const { 903 return GFX10_AEncoding; 904 } 905 906 bool hasGFX10_BEncoding() const { 907 return GFX10_BEncoding; 908 } 909 910 bool hasGFX10_3Insts() const { 911 return GFX10_3Insts; 912 } 913 914 bool hasMadF16() const; 915 916 bool hasMovB64() const { return GFX940Insts; } 917 918 bool hasLshlAddB64() const { return GFX940Insts; } 919 920 bool enableSIScheduler() const { 921 return EnableSIScheduler; 922 } 923 924 bool loadStoreOptEnabled() const { 925 return EnableLoadStoreOpt; 926 } 927 928 bool hasSGPRInitBug() const { 929 return SGPRInitBug; 930 } 931 932 bool hasUserSGPRInit16Bug() const { 
933 return UserSGPRInit16Bug; 934 } 935 936 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 937 938 bool hasNegativeUnalignedScratchOffsetBug() const { 939 return NegativeUnalignedScratchOffsetBug; 940 } 941 942 bool hasMFMAInlineLiteralBug() const { 943 return HasMFMAInlineLiteralBug; 944 } 945 946 bool has12DWordStoreHazard() const { 947 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 948 } 949 950 // \returns true if the subtarget supports DWORDX3 load/store instructions. 951 bool hasDwordx3LoadStores() const { 952 return CIInsts; 953 } 954 955 bool hasReadM0MovRelInterpHazard() const { 956 return getGeneration() == AMDGPUSubtarget::GFX9; 957 } 958 959 bool hasReadM0SendMsgHazard() const { 960 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 961 getGeneration() <= AMDGPUSubtarget::GFX9; 962 } 963 964 bool hasReadM0LdsDmaHazard() const { 965 return getGeneration() == AMDGPUSubtarget::GFX9; 966 } 967 968 bool hasReadM0LdsDirectHazard() const { 969 return getGeneration() == AMDGPUSubtarget::GFX9; 970 } 971 972 bool hasVcmpxPermlaneHazard() const { 973 return HasVcmpxPermlaneHazard; 974 } 975 976 bool hasVMEMtoScalarWriteHazard() const { 977 return HasVMEMtoScalarWriteHazard; 978 } 979 980 bool hasSMEMtoVectorWriteHazard() const { 981 return HasSMEMtoVectorWriteHazard; 982 } 983 984 bool hasLDSMisalignedBug() const { 985 return LDSMisalignedBug && !EnableCuMode; 986 } 987 988 bool hasInstFwdPrefetchBug() const { 989 return HasInstFwdPrefetchBug; 990 } 991 992 bool hasVcmpxExecWARHazard() const { 993 return HasVcmpxExecWARHazard; 994 } 995 996 bool hasLdsBranchVmemWARHazard() const { 997 return HasLdsBranchVmemWARHazard; 998 } 999 1000 // Has one cycle hazard on transcendental instruction feeding a 1001 // non transcendental VALU. 
1002 bool hasTransForwardingHazard() const { return GFX940Insts; } 1003 1004 // Has one cycle hazard on a VALU instruction partially writing dst with 1005 // a shift of result bits feeding another VALU instruction. 1006 bool hasDstSelForwardingHazard() const { return GFX940Insts; } 1007 1008 // Cannot use op_sel with v_dot instructions. 1009 bool hasDOTOpSelHazard() const { return GFX940Insts; } 1010 1011 // Does not have HW interlocs for VALU writing and then reading SGPRs. 1012 bool hasVDecCoExecHazard() const { 1013 return GFX940Insts; 1014 } 1015 1016 bool hasNSAtoVMEMBug() const { 1017 return HasNSAtoVMEMBug; 1018 } 1019 1020 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 1021 1022 bool hasHardClauses() const { return getGeneration() >= GFX10; } 1023 1024 bool hasGFX90AInsts() const { return GFX90AInsts; } 1025 1026 bool hasVOP3DPP() const { return getGeneration() >= GFX11; } 1027 1028 bool hasLdsDirect() const { return getGeneration() >= GFX11; } 1029 1030 bool hasVALUPartialForwardingHazard() const { 1031 return getGeneration() >= GFX11; 1032 } 1033 1034 bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; } 1035 1036 /// Return if operations acting on VGPR tuples require even alignment. 1037 bool needsAlignedVGPRs() const { return GFX90AInsts; } 1038 1039 /// Return true if the target has the S_PACK_HL_B32_B16 instruction. 1040 bool hasSPackHL() const { return GFX11Insts; } 1041 1042 /// Return true if the target's EXP instruction has the COMPR flag, which 1043 /// affects the meaning of the EN (enable) bits. 1044 bool hasCompressedExport() const { return !GFX11Insts; } 1045 1046 /// Return true if the target's EXP instruction supports the NULL export 1047 /// target. 
1048 bool hasNullExportTarget() const { return !GFX11Insts; } 1049 1050 bool hasVOPDInsts() const { return HasVOPDInsts; } 1051 1052 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } 1053 1054 /// Return true if the target has the S_DELAY_ALU instruction. 1055 bool hasDelayAlu() const { return GFX11Insts; } 1056 1057 bool hasPackedTID() const { return HasPackedTID; } 1058 1059 // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that 1060 // hasGFX90AInsts is also true. 1061 bool hasGFX940Insts() const { return GFX940Insts; } 1062 1063 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1064 /// SGPRs 1065 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1066 1067 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1068 /// VGPRs 1069 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1070 1071 /// Return occupancy for the given function. Used LDS and a number of 1072 /// registers if provided. 1073 /// Note, occupancy can be affected by the scratch allocation as well, but 1074 /// we do not have enough information to compute it. 1075 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 1076 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 1077 1078 /// \returns true if the flat_scratch register should be initialized with the 1079 /// pointer to the wave's scratch memory rather than a size and offset. 1080 bool flatScratchIsPointer() const { 1081 return getGeneration() >= AMDGPUSubtarget::GFX9; 1082 } 1083 1084 /// \returns true if the flat_scratch register is initialized by the HW. 1085 /// In this case it is readonly. 
1086 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 1087 1088 /// \returns true if the machine has merged shaders in which s0-s7 are 1089 /// reserved by the hardware and user SGPRs start at s8 1090 bool hasMergedShaders() const { 1091 return getGeneration() >= GFX9; 1092 } 1093 1094 // \returns true if the target supports the pre-NGG legacy geometry path. 1095 bool hasLegacyGeometry() const { return getGeneration() < GFX11; } 1096 1097 /// \returns SGPR allocation granularity supported by the subtarget. 1098 unsigned getSGPRAllocGranule() const { 1099 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1100 } 1101 1102 /// \returns SGPR encoding granularity supported by the subtarget. 1103 unsigned getSGPREncodingGranule() const { 1104 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1105 } 1106 1107 /// \returns Total number of SGPRs supported by the subtarget. 1108 unsigned getTotalNumSGPRs() const { 1109 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1110 } 1111 1112 /// \returns Addressable number of SGPRs supported by the subtarget. 1113 unsigned getAddressableNumSGPRs() const { 1114 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1115 } 1116 1117 /// \returns Minimum number of SGPRs that meets the given number of waves per 1118 /// execution unit requirement supported by the subtarget. 1119 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1120 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1121 } 1122 1123 /// \returns Maximum number of SGPRs that meets the given number of waves per 1124 /// execution unit requirement supported by the subtarget. 1125 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1126 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1127 } 1128 1129 /// \returns Reserved number of SGPRs. This is common 1130 /// utility function called by MachineFunction and 1131 /// Function variants of getReservedNumSGPRs. 
1132 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const; 1133 /// \returns Reserved number of SGPRs for given machine function \p MF. 1134 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1135 1136 /// \returns Reserved number of SGPRs for given function \p F. 1137 unsigned getReservedNumSGPRs(const Function &F) const; 1138 1139 /// \returns max num SGPRs. This is the common utility 1140 /// function called by MachineFunction and Function 1141 /// variants of getMaxNumSGPRs. 1142 unsigned getBaseMaxNumSGPRs(const Function &F, 1143 std::pair<unsigned, unsigned> WavesPerEU, 1144 unsigned PreloadedSGPRs, 1145 unsigned ReservedNumSGPRs) const; 1146 1147 /// \returns Maximum number of SGPRs that meets number of waves per execution 1148 /// unit requirement for function \p MF, or number of SGPRs explicitly 1149 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1150 /// 1151 /// \returns Value that meets number of waves per execution unit requirement 1152 /// if explicitly requested value cannot be converted to integer, violates 1153 /// subtarget's specifications, or does not meet number of waves per execution 1154 /// unit requirement. 1155 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1156 1157 /// \returns Maximum number of SGPRs that meets number of waves per execution 1158 /// unit requirement for function \p F, or number of SGPRs explicitly 1159 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. 1160 /// 1161 /// \returns Value that meets number of waves per execution unit requirement 1162 /// if explicitly requested value cannot be converted to integer, violates 1163 /// subtarget's specifications, or does not meet number of waves per execution 1164 /// unit requirement. 1165 unsigned getMaxNumSGPRs(const Function &F) const; 1166 1167 /// \returns VGPR allocation granularity supported by the subtarget. 
unsigned getVGPRAllocGranule() const {
  return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
}

/// \returns VGPR encoding granularity supported by the subtarget.
unsigned getVGPREncodingGranule() const {
  return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
}

/// \returns Total number of VGPRs supported by the subtarget.
unsigned getTotalNumVGPRs() const {
  return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
}

/// \returns Addressable number of VGPRs supported by the subtarget.
unsigned getAddressableNumVGPRs() const {
  return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
}

/// \returns Minimum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
  return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
}

/// \returns Maximum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
  return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
}

/// \returns max num VGPRs. This is the common utility function
/// called by MachineFunction and Function variants of getMaxNumVGPRs.
// NOTE(review): \p WavesPerEU is presumably the (minimum, maximum) waves per
// execution unit pair requested for \p F - confirm against the definition.
unsigned getBaseMaxNumVGPRs(const Function &F,
                            std::pair<unsigned, unsigned> WavesPerEU) const;

/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p F, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
///
/// \returns Value that meets number of waves per execution unit requirement
/// if explicitly requested value cannot be converted to integer, violates
/// subtarget's specifications, or does not meet number of waves per execution
/// unit requirement.
1211 unsigned getMaxNumVGPRs(const Function &F) const; 1212 1213 unsigned getMaxNumAGPRs(const Function &F) const { 1214 return getMaxNumVGPRs(F); 1215 } 1216 1217 /// \returns Maximum number of VGPRs that meets number of waves per execution 1218 /// unit requirement for function \p MF, or number of VGPRs explicitly 1219 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1220 /// 1221 /// \returns Value that meets number of waves per execution unit requirement 1222 /// if explicitly requested value cannot be converted to integer, violates 1223 /// subtarget's specifications, or does not meet number of waves per execution 1224 /// unit requirement. 1225 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1226 1227 void getPostRAMutations( 1228 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1229 const override; 1230 1231 std::unique_ptr<ScheduleDAGMutation> 1232 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; 1233 1234 bool isWave32() const { 1235 return getWavefrontSize() == 32; 1236 } 1237 1238 bool isWave64() const { 1239 return getWavefrontSize() == 64; 1240 } 1241 1242 const TargetRegisterClass *getBoolRC() const { 1243 return getRegisterInfo()->getBoolRC(); 1244 } 1245 1246 /// \returns Maximum number of work groups per compute unit supported by the 1247 /// subtarget and limited by given \p FlatWorkGroupSize. 1248 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1249 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1250 } 1251 1252 /// \returns Minimum flat work group size supported by the subtarget. 1253 unsigned getMinFlatWorkGroupSize() const override { 1254 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1255 } 1256 1257 /// \returns Maximum flat work group size supported by the subtarget. 
unsigned getMaxFlatWorkGroupSize() const override {
  return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
}

/// \returns Number of waves per execution unit required to support the given
/// \p FlatWorkGroupSize.
unsigned
getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
  return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
}

/// \returns Minimum number of waves per execution unit supported by the
/// subtarget.
unsigned getMinWavesPerEU() const override {
  return AMDGPU::IsaInfo::getMinWavesPerEU(this);
}

/// Override of the scheduler dependency-adjustment hook; defined out of line.
// NOTE(review): presumably adjusts \p Dep, the dependency between operand
// \p DefOpIdx of \p Def and operand \p UseOpIdx of \p Use - confirm against
// the definition.
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                           SDep &Dep) const override;

/// \returns true if it's beneficial on this subtarget for the scheduler to
/// cluster stores as well as loads. This is reported when the generation
/// compares at or above GFX11.
bool shouldClusterStores() const { return getGeneration() >= GFX11; }
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H