//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
    "amdgpu-disable-power-sched",
    cl::desc("Disable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing.
  // Other OSes default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    BackOffBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX940Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasImageInsts(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    EnableFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                          const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  unsigned NBytes = (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) ? 256 : 56;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}