//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
    "amdgpu-disable-power-sched",
    cl::desc("Disable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX940Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    EnableFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Return true if an artificial edge from \p Pred to \p Succ can be added
  // without creating a cycle, i.e. neither \p Pred nor any of its transitive
  // predecessors is already a transitive successor of \p Succ.
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}